diff --git a/.github/actions/setup-macos-builder/action.yaml b/.github/actions/setup-macos-builder/action.yaml deleted file mode 100644 index fffdab160b..0000000000 --- a/.github/actions/setup-macos-builder/action.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Prepare Rust Builder for MacOS -description: 'Prepare Rust Build Environment for MacOS' -inputs: - rust-version: - description: 'version of rust to install (e.g. stable)' - required: true - default: 'stable' -runs: - using: "composite" - steps: - - name: Install protobuf compiler - shell: bash - run: | - mkdir -p $HOME/d/protoc - cd $HOME/d/protoc - export PROTO_ZIP="protoc-29.1-osx-x86_64.zip" - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.1/$PROTO_ZIP - unzip $PROTO_ZIP - echo "$HOME/d/protoc/bin" >> $GITHUB_PATH - export PATH=$PATH:$HOME/d/protoc/bin - protoc --version - - name: Setup Rust toolchain - shell: bash - run: | - rustup update stable - rustup toolchain install stable - rustup default stable - rustup component add rustfmt - - name: Configure rust runtime env - uses: ./.github/actions/setup-rust-runtime diff --git a/.github/actions/setup-rust-runtime/action.yaml b/.github/actions/setup-rust-runtime/action.yaml index b6fb2c898b..e0341de93b 100644 --- a/.github/actions/setup-rust-runtime/action.yaml +++ b/.github/actions/setup-rust-runtime/action.yaml @@ -20,10 +20,6 @@ description: 'Setup Rust Runtime Environment' runs: using: "composite" steps: - # https://github.com/apache/datafusion/issues/15535 - # disabled because neither version nor git hash works with apache github policy - #- name: Run sccache-cache - # uses: mozilla-actions/sccache-action@65101d47ea8028ed0c98a1cdea8dd9182e9b5133 # v0.0.8 - name: Configure runtime env shell: bash # do not produce debug symbols to keep memory usage down @@ -32,11 +28,6 @@ runs: # # Set debuginfo=line-tables-only as debuginfo=0 causes immensely slow build # See for more details: https://github.com/rust-lang/rust/issues/119560 - # - # readd the following to the run below once sccache-cache is re-enabled - # echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV - # echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV run: | echo "RUST_BACKTRACE=1" >> $GITHUB_ENV echo "RUSTFLAGS=-C debuginfo=line-tables-only -C incremental=false" >> $GITHUB_ENV - diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7c2b7e3a54..ec3368da9a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -20,7 +20,7 @@ updates: - package-ecosystem: cargo directory: "/" schedule: - interval: daily + interval: weekly target-branch: main labels: [auto-dependencies] ignore: diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index b26ac8c02e..d05141c2e9 100644 --- 
a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,10 +42,8 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@2fdc5fd6ac805b0f8256893bd4c807bcb666af00 # v2 + uses: taiki-e/install-action@d0f4f69b07c0804d1003ca9a5a5f853423872ed9 # v2 with: tool: cargo-audit - name: Run audit check - # Ignored until https://github.com/apache/datafusion/issues/15571 - # ignored py03 warning until arrow 55 upgrade - run: cargo audit --ignore RUSTSEC-2024-0370 --ignore RUSTSEC-2025-0020 + run: cargo audit diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 97f1b4e5b2..1035772904 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -33,7 +33,10 @@ jobs: name: Check License Header steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - uses: korandoru/hawkeye@c3d9887d4b48e5b6c77306934606a4ff4623a2d3 # v6.2.0 + - name: Install HawkEye + run: cargo install hawkeye --version 6.2.0 --locked --profile dev + - name: Run license header check + run: ci/scripts/license_header.sh prettier: name: Use prettier to check formatting of documents diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 30fd684a01..154843944c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -55,7 +55,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: shared-key: "amd-ci-check" # this job uses it's own cache becase check has a separate cache and we need it to be fast as it blocks other jobs save-if: ${{ github.ref_name == 'main' }} @@ -108,7 +108,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: save-if: false # set in linux-test shared-key: "amd-ci" @@ -176,7 +176,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: save-if: false # set in linux-test shared-key: "amd-ci" @@ -281,7 +281,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci" @@ -308,10 +308,6 @@ jobs: name: cargo test datafusion-cli (amd64) needs: linux-build-lib runs-on: ubuntu-latest - # should be uncommented once https://github.com/apache/datafusion/pull/16644 is merged - # and cache should be added - # container: - # image: amd64/rust steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -319,6 +315,11 @@ jobs: fetch-depth: 1 - name: Setup Rust toolchain run: rustup toolchain install stable + - name: Rust Dependency Cache + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 + with: + save-if: false # set in linux-test + shared-key: "amd-ci" - name: Run tests (excluding doctests) env: RUST_BACKTRACE: 1 @@ -348,7 +349,7 @@ jobs: with: rust-version: stable - name: Rust Dependency Cache - uses: 
Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci-linux-test-example" @@ -411,14 +412,16 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@2fdc5fd6ac805b0f8256893bd4c807bcb666af00 # v2 + uses: taiki-e/install-action@d0f4f69b07c0804d1003ca9a5a5f853423872ed9 # v2 with: tool: wasm-pack - name: Run tests with headless mode working-directory: ./datafusion/wasmtest run: | - RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack test --headless --firefox - RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack test --headless --chrome --chromedriver $CHROMEWEBDRIVER/chromedriver + # debuginfo=none because CI tests weren't completing successfully after this upstream PR: + # https://github.com/wasm-bindgen/wasm-bindgen/pull/4635 + RUSTFLAGS='--cfg getrandom_backend="wasm_js" -C debuginfo=none' wasm-pack test --headless --firefox + RUSTFLAGS='--cfg getrandom_backend="wasm_js" -C debuginfo=none' wasm-pack test --headless --chrome --chromedriver $CHROMEWEBDRIVER/chromedriver # verify that the benchmark queries return the correct results verify-benchmark-results: @@ -533,22 +536,6 @@ jobs: # export PATH=$PATH:$HOME/d/protoc/bin # cargo test --lib --tests --bins --features avro,json,backtrace - # Commenting out intel mac build as so few users would ever use it - # Details: https://github.com/apache/datafusion/issues/13846 - # macos: - # name: cargo test (macos) - # runs-on: macos-latest - # steps: - # - uses: actions/checkout@v4 - # with: - # submodules: true - # fetch-depth: 1 - # - name: Setup Rust toolchain - # uses: ./.github/actions/setup-macos-builder - # - name: Run tests (excluding doctests) - # shell: bash - # run: cargo test run --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace - macos-aarch64: name: cargo test (macos-aarch64) runs-on: macos-14 @@ -680,7 +667,7 @@ jobs: - name: Install Clippy run: rustup component add clippy - name: Rust Dependency Cache - uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0 + uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1 # v2.8.1 with: save-if: ${{ github.ref_name == 'main' }} shared-key: "amd-ci-clippy" @@ -752,7 +739,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@2fdc5fd6ac805b0f8256893bd4c807bcb666af00 # v2 + uses: taiki-e/install-action@d0f4f69b07c0804d1003ca9a5a5f853423872ed9 # v2 with: tool: cargo-msrv diff --git a/.gitignore b/.gitignore index 4ae32925d9..8466a72ada 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ docker_cache *.orig .*.swp .*.swo +*.pending-snap venv/* diff --git a/Cargo.lock b/Cargo.lock index 9dd40437fb..fe0f93e1f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -187,9 +187,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.99" +version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "apache-avro" @@ -212,8 +212,8 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.27.2", + "strum_macros 0.27.2", "thiserror", 
"uuid", "xz2", @@ -234,9 +234,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" dependencies = [ "arrow-arith", "arrow-array", @@ -258,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" dependencies = [ "arrow-array", "arrow-buffer", @@ -272,9 +272,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -283,15 +283,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "num", ] [[package]] name = "arrow-buffer" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" dependencies = [ "bytes", "half", @@ -300,9 +300,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" dependencies = [ "arrow-array", "arrow-buffer", @@ -321,9 +321,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" dependencies = [ "arrow-array", "arrow-cast", @@ -336,9 +336,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -348,9 +348,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" +checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" dependencies = [ "arrow-arith", "arrow-array", @@ -368,21 +368,22 @@ dependencies = [ "futures", "once_cell", "paste", - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", "tonic", ] [[package]] name = "arrow-ipc" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" 
+checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", "lz4_flex", "zstd", @@ -390,9 +391,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" dependencies = [ "arrow-array", "arrow-buffer", @@ -401,7 +402,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.11.3", + "indexmap 2.11.4", "lexical-core", "memchr", "num", @@ -412,9 +413,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" dependencies = [ "arrow-array", "arrow-buffer", @@ -425,9 +426,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" +checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" dependencies = [ "arrow-array", "arrow-data", @@ -437,9 +438,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" dependencies = [ "arrow-array", "arrow-buffer", @@ -450,20 +451,20 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "serde", "serde_json", ] [[package]] name = "arrow-select" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -475,9 +476,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" dependencies = [ "arrow-array", "arrow-buffer", @@ -615,9 +616,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.13.3" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba" +checksum = "94b8ff6c09cd57b16da53641caa860168b88c172a5ee163b0288d3d6eea12786" dependencies = [ "aws-lc-sys", "zeroize", @@ -625,9 +626,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.30.0" +version = "0.31.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" +checksum = "0e44d16778acaf6a9ec9899b92cebd65580b83f685446bf2e1f5d3d732f99dcd" dependencies = [ "bindgen", "cc", @@ -662,9 +663,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.83.0" +version = "1.84.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cd43af212d2a1c4dedff6f044d7e1961e5d9e7cfe773d70f31d9842413886" +checksum = "357a841807f6b52cb26123878b3326921e2a25faca412fabdd32bd35b7edd5d3" dependencies = [ "aws-credential-types", "aws-runtime", @@ -684,9 +685,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.84.0" +version = "1.85.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20ec4a95bd48e0db7a424356a161f8d87bd6a4f0af37204775f0da03d9e39fc3" +checksum = "67e05f33b6c9026fecfe9b3b6740f34d41bc6ff641a6a32dabaab60209245b75" dependencies = [ "aws-credential-types", "aws-runtime", @@ -706,9 +707,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.85.0" +version = "1.86.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410309ad0df4606bc721aff0d89c3407682845453247213a0ccc5ff8801ee107" +checksum = "e7d835f123f307cafffca7b9027c14979f1d403b417d8541d67cf252e8a21e35" dependencies = [ "aws-credential-types", "aws-runtime", @@ -834,9 +835,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.1" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3946acbe1ead1301ba6862e712c7903ca9bb230bdf1fbd1b5ac54158ef2ab1f" +checksum = "4fa63ad37685ceb7762fa4d73d06f1d5493feb88e3f27259b9ed277f4c01b185" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -1017,25 +1018,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.5" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "cexpr", "clang-sys", - "itertools 0.12.1", - "lazy_static", - "lazycell", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", + "rustc-hash", "shlex", "syn 2.0.106", - "which", ] [[package]] @@ -1046,9 +1044,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.1" +version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" [[package]] name = "bitvec" @@ -1145,9 +1143,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.7.1" +version = "3.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "537c317ddf588aab15c695bf92cf55dec159b93221c074180ca3e0e5a94da415" +checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" dependencies = [ "bon-macros", "rustversion", @@ -1155,11 +1153,11 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.7.1" +version = "3.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5abbf2d4a4c6896197c9de13d6d7cb7eff438c63dacde1dde980569cb00248" +checksum = 
"d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" dependencies = [ - "darling 0.21.3", + "darling", "ident_case", "prettyplease", "proc-macro2", @@ -1193,9 +1191,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.1" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1308,10 +1306,11 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.32" +version = "1.2.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e" +checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -1328,9 +1327,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.1" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" [[package]] name = "cfg_aliases" @@ -1397,7 +1396,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading 0.8.8", + "libloading 0.8.9", ] [[package]] @@ -1413,9 +1412,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.47" +version = "4.5.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" +checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" dependencies = [ "clap_builder", "clap_derive", @@ -1423,9 +1422,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.47" +version = "4.5.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" +checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" dependencies = [ "anstream", "anstyle", @@ -1477,11 +1476,12 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "comfy-table" -version = "7.1.4" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width 0.2.1", ] @@ -1499,15 +1499,15 @@ dependencies = [ [[package]] name = "console" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e09ced7ebbccb63b4c65413d821f2e00ce54c5ca4514ddc6b3c892fdbcbc69d" +checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" dependencies = [ "encode_unicode", "libc", "once_cell", "unicode-width 0.2.1", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -1542,9 +1542,9 @@ dependencies = [ [[package]] name = "const_panic" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb8a602185c3c95b52f86dc78e55a6df9a287a7a93ddbcf012509930880cf879" +checksum = 
"e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" dependencies = [ "typewit", ] @@ -1573,18 +1573,18 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core_extensions" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" dependencies = [ "core_extensions_proc_macros", ] [[package]] name = "core_extensions_proc_macros" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" [[package]] name = "cpufeatures" @@ -1613,7 +1613,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.47", + "clap 4.5.48", "criterion-plot", "futures", "is-terminal", @@ -1735,38 +1735,14 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" -[[package]] -name = "darling" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" -dependencies = [ - "darling_core 0.20.11", - "darling_macro 0.20.11", -] - [[package]] name = "darling" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "darling_core 0.21.3", - "darling_macro 0.21.3", -] - -[[package]] -name = "darling_core" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn 2.0.106", + "darling_core", + "darling_macro", ] [[package]] @@ -1783,24 +1759,13 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "darling_macro" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" -dependencies = [ - "darling_core 0.20.11", - "quote", - "syn 2.0.106", -] - [[package]] name = "darling_macro" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "darling_core 0.21.3", + "darling_core", "quote", "syn 2.0.106", ] @@ -1968,7 +1933,7 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", - "clap 4.5.47", + "clap 4.5.48", "ctor", "datafusion", "dirs", @@ -2002,7 +1967,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "hex", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", "libc", "log", @@ -2175,7 +2140,7 @@ dependencies = [ "mimalloc", "nix", "object_store", - "prost", + "prost 0.13.5", "rand 0.9.2", "serde_json", "tempfile", @@ -2224,7 +2189,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", "itertools 0.14.0", "paste", @@ -2239,7 +2204,7 @@ version = "50.0.0" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.11.3", + "indexmap 2.11.4", "itertools 0.14.0", "paste", ] @@ -2260,7 
+2225,7 @@ dependencies = [ "doc-comment", "futures", "log", - "prost", + "prost 0.13.5", "semver", "tokio", ] @@ -2395,7 +2360,7 @@ dependencies = [ name = "datafusion-macros" version = "50.0.0" dependencies = [ - "datafusion-expr", + "datafusion-doc", "quote", "syn 2.0.106", ] @@ -2418,7 +2383,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", "itertools 0.14.0", "log", @@ -2442,7 +2407,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", "itertools 0.14.0", "parking_lot", @@ -2520,7 +2485,7 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", "itertools 0.14.0", "log", @@ -2549,7 +2514,7 @@ dependencies = [ "object_store", "pbjson", "pretty_assertions", - "prost", + "prost 0.13.5", "serde", "serde_json", "tokio", @@ -2563,7 +2528,7 @@ dependencies = [ "datafusion-common", "doc-comment", "pbjson", - "prost", + "prost 0.13.5", "serde", ] @@ -2622,6 +2587,7 @@ version = "50.0.0" dependencies = [ "arrow", "bigdecimal", + "chrono", "ctor", "datafusion-common", "datafusion-expr", @@ -2630,8 +2596,9 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.11.3", + "indexmap 2.11.4", "insta", + "itertools 0.14.0", "log", "paste", "recursive", @@ -2649,7 +2616,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.47", + "clap 4.5.48", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2687,7 +2654,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost", + "prost 0.13.5", "serde_json", "substrait", "tokio", @@ -2718,9 +2685,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.4.0" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +checksum = "d630bccd429a5bb5a64b5e94f693bfc48c9f8566418fda4c494cc94f911f87cc" dependencies = [ "powerfmt", "serde", @@ -2761,7 +2728,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -2900,12 +2867,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -2950,7 +2917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix 1.0.8", + "rustix", "windows-sys 0.59.0", ] @@ -2986,16 +2953,22 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.25" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586" +checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] +[[package]] +name = "find-msvc-tools" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959" + [[package]] name = "fixedbitset" version = "0.5.7" @@ -3008,7 +2981,7 @@ version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "rustc_version", ] @@ -3046,9 +3019,9 @@ dependencies = [ [[package]] name = "fs-err" -version = "3.1.1" +version = "3.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d7be93788013f265201256d58f04936a8079ad5dc898743aa20525f503b683" +checksum = "44f150ffc8782f35521cec2b23727707cb4045706ba3c854e86bef66b3a8cdbd" dependencies = [ "autocfg", ] @@ -3164,16 +3137,16 @@ dependencies = [ name = "gen" version = "0.1.0" dependencies = [ - "pbjson-build", - "prost-build", + "pbjson-build 0.8.0", + "prost-build 0.14.1", ] [[package]] name = "gen-common" version = "0.1.0" dependencies = [ - "pbjson-build", - "prost-build", + "pbjson-build 0.8.0", + "prost-build 0.14.1", ] [[package]] @@ -3218,7 +3191,7 @@ dependencies = [ "js-sys", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi 0.14.7+wasi-0.2.4", "wasm-bindgen", ] @@ -3259,7 +3232,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.11.3", + "indexmap 2.11.4", "slab", "tokio", "tokio-util", @@ -3307,6 +3280,12 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "heck" version = "0.3.3" @@ -3422,19 +3401,20 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "humantime" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" dependencies = [ + "atomic-waker", "bytes", "futures-channel", - "futures-util", + "futures-core", "h2", "http 1.3.1", "http-body 1.0.1", @@ -3442,6 +3422,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", + "pin-utils", "smallvec", "tokio", "want", @@ -3494,9 +3475,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" +checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" dependencies = [ "base64 0.22.1", "bytes", @@ -3533,9 +3514,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.63" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -3543,7 +3524,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core", + "windows-core 0.62.0", ] [[package]] @@ -3681,12 +3662,12 @@ 
dependencies = [ [[package]] name = "indexmap" -version = "2.11.3" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92119844f513ffa41556430369ab02c295a3578af21cf945caa3e9e0c2481ac3" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "serde", "serde_core", ] @@ -3697,7 +3678,7 @@ version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70a646d946d06bedbbc4cac4c218acf4bbf2d87757a784857025f4d447e4e1cd" dependencies = [ - "console 0.16.0", + "console 0.16.1", "portable-atomic", "unicode-width 0.2.1", "unit-prefix", @@ -3744,11 +3725,11 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "io-uring" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "cfg-if", "libc", ] @@ -3795,15 +3776,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.13.0" @@ -3854,9 +3826,9 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ "getrandom 0.3.3", "libc", @@ -3864,9 +3836,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.78" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0b063578492ceec17683ef2f8c5e89121fbd0b172cbc280635ab7567db2738" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" dependencies = [ "once_cell", "wasm-bindgen", @@ -3878,17 +3850,11 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "lexical-core" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -3899,53 +3865,46 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = "lexical-parse-integer" -version = "1.0.5" +version = "1.0.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] @@ -3956,9 +3915,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.175" +version = "0.2.176" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" [[package]] name = "libloading" @@ -3972,12 +3931,12 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-targets 0.53.3", + "windows-link 0.2.0", ] [[package]] @@ -3999,11 +3958,11 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" +checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "libc", "redox_syscall 0.5.17", ] @@ -4016,30 +3975,24 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.47", + "clap 4.5.48", "escape8259", ] [[package]] name = "libz-rs-sys" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" dependencies = [ "zlib-rs", ] [[package]] name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - -[[package]] -name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" @@ -4192,7 +4145,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "cfg-if", "cfg_aliases", "libc", @@ -4313,7 +4266,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", ] [[package]] @@ -4337,9 +4290,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efc4f07659e11cd45a341cd24d71e683e3be65d9ff1f8150061678fe60437496" +checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" dependencies = [ "async-trait", "base64 0.22.1", @@ -4448,9 +4401,9 @@ dependencies = [ [[package]] name = "parquet" -version = "56.0.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4467,7 +4420,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.0", "lz4_flex", "num", "num-bigint", @@ -4532,8 +4485,20 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck 0.5.0", "itertools 0.13.0", - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", +] + +[[package]] +name = "pbjson-build" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" +dependencies = [ + "heck 0.5.0", + "itertools 0.14.0", + "prost 0.14.1", + "prost-types 0.14.1", ] [[package]] @@ -4545,9 +4510,9 @@ dependencies = [ "bytes", "chrono", "pbjson", - "pbjson-build", - "prost", - "prost-build", + "pbjson-build 0.7.0", + "prost 0.13.5", + "prost-build 0.13.5", "serde", ] @@ -4564,7 +4529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.11.3", + "indexmap 2.11.4", ] [[package]] @@ -4575,42 +4540,43 @@ checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.11.3", + "indexmap 2.11.4", "serde", ] [[package]] name = "phf" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "phf_shared 0.11.3", + "phf_shared 0.12.1", ] [[package]] name = "phf" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ - "phf_shared 0.12.1", + "phf_shared 0.13.1", + "serde", ] [[package]] name = "phf_shared" 
-version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" dependencies = [ "siphasher", ] [[package]] name = "phf_shared" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" dependencies = [ "siphasher", ] @@ -4698,9 +4664,9 @@ dependencies = [ [[package]] name = "postgres-derive" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69700ea4603c5ef32d447708e6a19cd3e8ac197a000842e97f527daea5e4175f" +checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -4710,9 +4676,9 @@ dependencies = [ [[package]] name = "postgres-protocol" -version = "0.6.8" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76ff0abab4a9b844b93ef7b81f1efc0a366062aaef2cd702c76256b5dc075c54" +checksum = "fbef655056b916eb868048276cfd5d6a7dea4f81560dfd047f97c8c6fe3fcfd4" dependencies = [ "base64 0.22.1", "byteorder", @@ -4728,9 +4694,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613283563cd90e1dfc3518d548caee47e0e725455ed619881f5cf21f36de4b48" +checksum = "77a120daaabfcb0e324d5bf6e411e9222994cb3795c79943a0ef28ed27ea76e4" dependencies = [ "bytes", "chrono", @@ -4741,9 +4707,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" dependencies = [ "zerovec", ] @@ -4775,9 +4741,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.36" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", "syn 2.0.106", @@ -4785,9 +4751,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ "toml_edit", ] @@ -4818,9 +4784,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.97" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ "unicode-ident", ] @@ -4832,7 +4798,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.5", +] + +[[package]] +name = "prost" +version = "0.14.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d" +dependencies = [ + "bytes", + "prost-derive 0.14.1", ] [[package]] @@ -4848,8 +4824,28 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", + "regex", + "syn 2.0.106", + "tempfile", +] + +[[package]] +name = "prost-build" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6c3320f9abac597dcbc668774ef006702672474aad53c6d596b62e487b40b1" +dependencies = [ + "heck 0.5.0", + "itertools 0.14.0", + "log", + "multimap", + "once_cell", + "petgraph 0.7.1", + "prettyplease", + "prost 0.14.1", + "prost-types 0.14.1", "regex", "syn 2.0.106", "tempfile", @@ -4868,13 +4864,35 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "prost-derive" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9120690fafc389a67ba3803df527d0ec9cbbc9cc45e4cc20b332996dfb672425" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "prost-types" version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ - "prost", + "prost 0.13.5", +] + +[[package]] +name = "prost-types" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72" +dependencies = [ + "prost 0.14.1", ] [[package]] @@ -4985,9 +5003,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.38.1" +version = "0.38.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9845d9dccf565065824e69f9f235fafba1587031eda353c1f1561cd6a6be78f4" +checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" dependencies = [ "memchr", "serde", @@ -4995,18 +5013,18 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ "bytes", "cfg_aliases", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", - "socket2 0.5.10", + "socket2 0.6.0", "thiserror", "tokio", "tracing", @@ -5015,16 +5033,16 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", "getrandom 0.3.3", "lru-slab", "rand 0.9.2", "ring", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "rustls-pki-types", "slab", @@ -5036,23 +5054,23 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcebb1209ee276352ef14ff8732e24cc2b02bbac986cd74a4c81bcb2f9881970" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + 
"socket2 0.6.0", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" dependencies = [ "proc-macro2", ] @@ -5203,7 +5221,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", ] [[package]] @@ -5239,9 +5257,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.2" +version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" dependencies = [ "aho-corasick", "memchr", @@ -5251,9 +5269,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" dependencies = [ "aho-corasick", "memchr", @@ -5262,9 +5280,9 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" [[package]] name = "regex-syntax" @@ -5455,12 +5473,6 @@ version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -5478,35 +5490,22 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags 2.9.1", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - -[[package]] -name = "rustix" -version = "1.0.8" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "errno", "libc", - "linux-raw-sys 0.9.4", - "windows-sys 0.60.2", + "linux-raw-sys", + "windows-sys 0.61.0", ] [[package]] name = "rustls" -version = "0.23.31" +version = "0.23.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" +checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" dependencies = [ "aws-lc-rs", "once_cell", @@ -5550,9 +5549,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.4" +version = "0.103.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" +checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb" dependencies = [ "aws-lc-rs", "ring", @@ -5572,7 +5571,7 @@ version = "17.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6614df0b6d4cfb20d1d5e295332921793ce499af3ebc011bf1e393380e1e492" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "cfg-if", "clipboard-win", "fd-lock", @@ -5605,11 +5604,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.0", ] [[package]] @@ -5674,11 +5673,11 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "security-framework" -version = "3.3.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" +checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "core-foundation", "core-foundation-sys", "libc", @@ -5687,9 +5686,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.14.0" +version = "2.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" dependencies = [ "core-foundation-sys", "libc", @@ -5713,9 +5712,9 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" -version = "1.0.223" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a505d71960adde88e293da5cb5eda57093379f64e61cf77bf0e6a63af07a7bac" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ "serde_core", "serde_derive", @@ -5723,27 +5722,28 @@ dependencies = [ [[package]] name = "serde_bytes" -version = "0.11.17" +version = "0.11.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8437fd221bde2d4ca316d61b90e337e9e702b3820b87d63caa9ba6c02bd06d96" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" dependencies = [ "serde", + "serde_core", ] [[package]] name = "serde_core" -version = "1.0.223" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20f57cbd357666aa7b3ac84a90b4ea328f1d4ddb6772b430caa5d9e1309bb9e9" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.223" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d428d07faf17e306e699ec1e91996e5a165ba5d6bce5b5155173e91a8a01a56" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -5811,15 +5811,15 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.14.0" +version = "3.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5" +checksum = 
"c522100790450cf78eeac1507263d0a350d4d5b30df0c8e1fe051a10c22b376e" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.11.3", + "indexmap 2.11.4", "schemars 0.9.0", "schemars 1.0.4", "serde", @@ -5831,11 +5831,11 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.14.0" +version = "3.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f" +checksum = "327ada00f7d64abaac1e55a6911e90cf665aa051b9a561c7006c157f4633135e" dependencies = [ - "darling 0.20.11", + "darling", "proc-macro2", "quote", "syn 2.0.106", @@ -5847,7 +5847,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.11.3", + "indexmap 2.11.4", "itoa", "ryu", "serde", @@ -6040,12 +6040,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stringprep" version = "0.1.5" @@ -6110,12 +6104,31 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.106", +] + [[package]] name = "strum_macros" version = "0.27.2" @@ -6146,12 +6159,12 @@ checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" dependencies = [ "heck 0.5.0", "pbjson", - "pbjson-build", + "pbjson-build 0.7.0", "pbjson-types", "prettyplease", - "prost", - "prost-build", - "prost-types", + "prost 0.13.5", + "prost-build 0.13.5", + "prost-types 0.13.5", "protobuf-src", "regress", "schemars 0.8.22", @@ -6214,9 +6227,9 @@ dependencies = [ [[package]] name = "sysinfo" -version = "0.37.0" +version = "0.37.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07cec4dc2d2e357ca1e610cfb07de2fa7a10fc3e9fe89f72545f3d244ea87753" +checksum = "3bddd368fda2f82ead69c03d46d351987cfa0c2a57abfa37a017f3aa3e9bf69a" dependencies = [ "libc", "memchr", @@ -6234,21 +6247,21 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "tempfile" -version = "3.22.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84fa4d11fadde498443cca10fd3ac23c951f0dc59e080e9f4b93d4df4e4eea53" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", - 
"rustix 1.0.8", - "windows-sys 0.60.2", + "rustix", + "windows-sys 0.61.0", ] [[package]] @@ -6311,18 +6324,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", @@ -6351,9 +6364,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.41" +version = "0.3.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" dependencies = [ "deranged", "itoa", @@ -6366,15 +6379,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" [[package]] name = "time-macros" -version = "0.2.22" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" dependencies = [ "num-conv", "time-core", @@ -6411,9 +6424,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -6457,9 +6470,9 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c95d533c83082bb6490e0189acaa0bbeef9084e60471b696ca6988cd0541fb0" +checksum = "a156efe7fff213168257853e1dfde202eed5f487522cbbbf7d219941d753d853" dependencies = [ "async-trait", "byteorder", @@ -6470,12 +6483,12 @@ dependencies = [ "log", "parking_lot", "percent-encoding", - "phf 0.11.3", + "phf 0.13.1", "pin-project-lite", "postgres-protocol", "postgres-types", "rand 0.9.2", - "socket2 0.5.10", + "socket2 0.6.0", "tokio", "tokio-util", "whoami", @@ -6483,9 +6496,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +checksum = "05f63835928ca123f1bef57abbcd23bb2ba0ac9ae1235f1e65bda0d06e7786bd" dependencies = [ "rustls", "tokio", @@ -6532,18 +6545,31 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.11" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = 
"32f1085dec27c2b6632b04c80b3bb1b4300d6495d1e129693bdda7d91e72eec1" +dependencies = [ + "serde_core", +] [[package]] name = "toml_edit" -version = "0.22.27" +version = "0.23.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "f3effe7c0e86fdff4f69cdd2ccc1b96f933e24811c5441d44904e8683e27184b" dependencies = [ - "indexmap 2.11.3", + "indexmap 2.11.4", "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cf893c33be71572e0e9aa6dd15e6677937abd686b066eac3f8cd3531688a627" +dependencies = [ "winnow", ] @@ -6566,7 +6592,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.5", "socket2 0.5.10", "tokio", "tokio-stream", @@ -6584,7 +6610,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.11.3", + "indexmap 2.11.4", "pin-project-lite", "slab", "sync_wrapper", @@ -6601,7 +6627,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.1", + "bitflags 2.9.4", "bytes", "futures-util", "http 1.3.1", @@ -6705,9 +6731,9 @@ checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" [[package]] name = "twox-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" [[package]] name = "typed-arena" @@ -6723,15 +6749,15 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "typewit" -version = "1.12.1" +version = "1.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97e72ba082eeb9da9dc68ff5a2bf727ef6ce362556e8d29ec1aed3bd05e7d86a" +checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" [[package]] name = "typify" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c6c647a34e851cf0260ccc14687f17cdcb8302ff1a8a687a24b97ca0f82406f" +checksum = "7144144e97e987c94758a3017c920a027feac0799df325d6df4fc8f08d02068e" dependencies = [ "typify-impl", "typify-macro", @@ -6739,9 +6765,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "741b7f1e2e1338c0bee5ad5a7d3a9bbd4e24c33765c08b7691810e68d879365d" +checksum = "062879d46aa4c9dfe0d33b035bbaf512da192131645d05deacb7033ec8581a09" dependencies = [ "heck 0.5.0", "log", @@ -6759,9 +6785,9 @@ dependencies = [ [[package]] name = "typify-macro" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7560adf816a1e8dad7c63d8845ef6e31e673e39eab310d225636779230cbedeb" +checksum = "9708a3ceb6660ba3f8d2b8f0567e7d4b8b198e2b94d093b8a6077a751425de9e" dependencies = [ "proc-macro2", "quote", @@ -6782,9 +6808,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" [[package]] name = "unicode-normalization" @@ -6930,11 +6956,20 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasi" -version = "0.14.2+wasi-0.2.4" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen-rt", + "wit-bindgen", ] [[package]] @@ -6945,9 +6980,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.101" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e14915cadd45b529bb8d1f343c4ed0ac1de926144b746e2710f9cd05df6603b" +checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" dependencies = [ "cfg-if", "once_cell", @@ -6958,9 +6993,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.101" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28d1ba982ca7923fd01448d5c30c6864d0a14109560296a162f80f305fb93bb" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" dependencies = [ "bumpalo", "log", @@ -6972,9 +7007,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.51" +version = "0.4.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca85039a9b469b38336411d6d6ced91f3fc87109a2a27b0c197663f5144dffe" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" dependencies = [ "cfg-if", "js-sys", @@ -6985,9 +7020,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.101" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3d463ae3eff775b0c45df9da45d68837702ac35af998361e2c84e7c5ec1b0d" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6995,9 +7030,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.101" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bb4ce89b08211f923caf51d527662b75bdc9c9c7aab40f86dcb9fb85ac552aa" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", @@ -7008,18 +7043,18 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.101" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f143854a3b13752c6950862c906306adb27c7e839f7414cec8fea35beab624c1" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.51" +version = "0.3.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80cc7f8a4114fdaa0c58383caf973fc126cf004eba25c9dc639bccd3880d55ad" +checksum = 
"4e381134e148c1062f965a42ed1f5ee933eef2927c3f70d1812158f711d39865" dependencies = [ "js-sys", "minicov", @@ -7030,9 +7065,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.51" +version = "0.3.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5ada2ab788d46d4bda04c9d567702a79c8ced14f51f221646a16ed39d0e6a5d" +checksum = "b673bca3298fe582aeef8352330ecbad91849f85090805582400850f8270a2e8" dependencies = [ "proc-macro2", "quote", @@ -7054,9 +7089,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.78" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77e4b637749ff0d92b8fad63aa1f7cff3cbe125fd49c175cd6345e7272638b12" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" dependencies = [ "js-sys", "wasm-bindgen", @@ -7072,18 +7107,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whoami" version = "1.6.1" @@ -7113,11 +7136,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.0", ] [[package]] @@ -7133,7 +7156,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" dependencies = [ "windows-collections", - "windows-core", + "windows-core 0.61.2", "windows-future", "windows-link 0.1.3", "windows-numerics", @@ -7145,7 +7168,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" dependencies = [ - "windows-core", + "windows-core 0.61.2", ] [[package]] @@ -7157,8 +7180,21 @@ dependencies = [ "windows-implement", "windows-interface", "windows-link 0.1.3", - "windows-result", - "windows-strings", + "windows-result 0.3.4", + "windows-strings 0.4.2", +] + +[[package]] +name = "windows-core" +version = "0.62.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57fe7168f7de578d2d8a05b07fd61870d2e73b4020e9f49aa00da8471723497c" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link 0.2.0", + "windows-result 0.4.0", + "windows-strings 0.5.0", ] [[package]] @@ -7167,7 +7203,7 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" dependencies = [ - "windows-core", + "windows-core 0.61.2", "windows-link 0.1.3", "windows-threading", ] @@ -7212,7 +7248,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" dependencies = [ - "windows-core", + "windows-core 0.61.2", "windows-link 0.1.3", ] @@ -7225,6 +7261,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-result" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows-strings" version = "0.4.2" @@ -7234,6 +7279,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-strings" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -7261,6 +7315,15 @@ dependencies = [ "windows-targets 0.53.3", ] +[[package]] +name = "windows-sys" +version = "0.61.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +dependencies = [ + "windows-link 0.2.0", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -7401,21 +7464,18 @@ checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] name = "winnow" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" dependencies = [ "memchr", ] [[package]] -name = "wit-bindgen-rt" -version = "0.39.0" +name = "wit-bindgen" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags 2.9.1", -] +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" @@ -7434,12 +7494,12 @@ dependencies = [ [[package]] name = "xattr" -version = "1.5.1" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af3a19837351dc82ba89f8a125e22a3c475f05aba604acc023d62b2739ae2909" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.0.8", + "rustix", ] [[package]] @@ -7489,18 +7549,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", @@ -7569,9 +7629,9 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" [[package]] name = "zstd" @@ -7593,9 +7653,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" +version = "2.0.16+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +checksum = 
"91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 92392a1991..b54a75e5d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,7 +76,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) -rust-version = "1.86.0" +rust-version = "1.87.0" # Define DataFusion version version = "50.0.0" @@ -90,19 +90,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.0.0", features = [ +arrow = { version = "56.2.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.0.0", default-features = false } -arrow-flight = { version = "56.0.0", features = [ +arrow-buffer = { version = "56.2.0", default-features = false } +arrow-flight = { version = "56.2.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "56.0.0", default-features = false, features = [ +arrow-ipc = { version = "56.2.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.0.0", default-features = false } -arrow-schema = { version = "56.0.0", default-features = false } +arrow-ord = { version = "56.2.0", default-features = false } +arrow-schema = { version = "56.2.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" @@ -152,16 +152,15 @@ futures = "0.3" half = { version = "2.6.0", default-features = false } hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3" } -indexmap = "2.11.3" +indexmap = "2.11.4" itertools = "0.14" log = "^0.4" -object_store = { version = "0.12.3", default-features = false } +object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { version = "56.0.0", default-features = false, features = [ +parquet = { version = "56.2.0", default-features = false, features = [ "arrow", "async", "object_store", - "encryption", ] } pbjson = { version = "0.7.0" } pbjson-types = "0.7" @@ -180,13 +179,56 @@ testcontainers-modules = { version = "0.12" } tokio = { version = "1.47", features = ["macros", "rt", "sync"] } url = "2.5.7" +[workspace.lints.clippy] +# Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) +large_futures = "warn" +used_underscore_binding = "warn" +or_fun_call = "warn" +unnecessary_lazy_evaluations = "warn" +uninlined_format_args = "warn" +inefficient_to_string = "warn" + +[workspace.lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = [ + 'cfg(datafusion_coop, values("tokio", "tokio_fallback", "per_stream"))', + "cfg(tarpaulin)", + "cfg(tarpaulin_include)", +] } +unused_qualifications = "deny" + +# -------------------- +# Compilation Profiles +# -------------------- +# A Cargo profile is a preset for the compiler/linker knobs that trade off: +# - Build time: how quickly code compiles and links +# - Runtime performance: how fast the resulting binaries execute +# - Binary size: how large the executables end up +# - Debuggability: how much debug information is preserved for debugging and profiling +# +# Profiles available: +# - dev: default debug build; fastest to compile, slowest to run, full debug info +# for everyday development. +# Run: cargo run +# - release: optimized build; slowest to compile, fastest to run, smallest +# binaries for public releases. 
+# Run: cargo run --release +# - release-nonlto: skips LTO, so it builds quicker while staying close to +# release performance. It is useful when developing performance optimizations. +# Run: cargo run --profile release-nonlto +# - profiling: inherits release optimizations but retains debug info to support +# profiling tools and flamegraphs. +# Run: cargo run --profile profiling +# - ci: derived from `dev` but disables incremental builds and strips dependency +# symbols to keep CI artifacts small and reproducible. +# Run: cargo run --profile ci +# +# If you want to optimize compilation, the `compile_profile` benchmark can be useful. +# See `benchmarks/README.md` for more details. [profile.release] codegen-units = 1 lto = true strip = true # Eliminate debug information to minimize binary size -# the release profile takes a long time to build so we can use this profile during development to save time -# cargo build --profile release-nonlto [profile.release-nonlto] codegen-units = 16 debug-assertions = false @@ -199,36 +241,20 @@ rpath = false strip = false # Retain debug info for flamegraphs [profile.ci] +debug = false inherits = "dev" incremental = false -# ci turns off debug info, etc. for dependencies to allow for smaller binaries making caching more effective +# This rule applies to every package except workspace members (dependencies +# such as `arrow` and `tokio`). It disables debug info and related features on +# dependencies so their binaries stay smaller, improving cache reuse. [profile.ci.package."*"] debug = false debug-assertions = false strip = "debuginfo" incremental = false -# release inherited profile keeping debug information and symbols -# for mem/cpu profiling [profile.profiling] inherits = "release" debug = true strip = false - -[workspace.lints.clippy] -# Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) -large_futures = "warn" -used_underscore_binding = "warn" -or_fun_call = "warn" -unnecessary_lazy_evaluations = "warn" -uninlined_format_args = "warn" -inefficient_to_string = "warn" - -[workspace.lints.rust] -unexpected_cfgs = { level = "warn", check-cfg = [ - 'cfg(datafusion_coop, values("tokio", "tokio_fallback", "per_stream"))', - "cfg(tarpaulin)", - "cfg(tarpaulin_include)", -] } -unused_qualifications = "deny" diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 2225d99820..b3fd520814 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -48,7 +48,7 @@ object_store = { workspace = true } parquet = { workspace = true, default-features = true } rand = { workspace = true } regex.workspace = true -serde = { version = "1.0.219", features = ["derive"] } +serde = { version = "1.0.228", features = ["derive"] } serde_json = { workspace = true } snmalloc-rs = { version = "0.3", optional = true } structopt = { version = "0.3", default-features = false } diff --git a/benchmarks/README.md b/benchmarks/README.md index 872500ef84..8fed85fa02 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -87,6 +87,39 @@ To run for specific query, for example Q21 ./bench.sh run tpch10 21 ``` +## Compile profile benchmark + +Generate the data required for the compile profile helper (TPC-H SF=1): + +```shell +./bench.sh data compile_profile +``` + +Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `release-nonlto`): + +```shell +./bench.sh run compile_profile +``` + +Limit the run to a single profile: + +```shell +./bench.sh run compile_profile dev +``` + +Or specify a subset 
of profiles: + +```shell +./bench.sh run compile_profile dev release +``` + +You can also invoke the helper directly if you need to customise arguments further: + +```shell +./benchmarks/compile_profile.py --profiles dev release --data /path/to/tpch_sf1 +``` + + ## Benchmark with modified configurations ### Select join algorithm @@ -727,6 +760,20 @@ Different queries are included to test nested loop joins under various workloads ./bench.sh run nlj ``` +## Hash Join + +This benchmark focuses on the performance of queries with nested hash joins, minimizing other overheads such as scanning data sources or evaluating predicates. + +Several queries are included to test hash joins under various workloads. + +### Example Run + +```bash +# No need to generate data: this benchmark uses table function `range()` as the data source + +./bench.sh run hj +``` + ## Cancellation Test performance of cancelling queries. diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index b99ab01005..dbfd319dd9 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -125,6 +125,9 @@ imdb: Join Order Benchmark (JOB) using the IMDB dataset conver # Micro-Benchmarks (specific operators and features) cancellation: How long cancelling a query takes nlj: Benchmark for simple nested loop joins, testing various join scenarios +hj: Benchmark for simple hash joins, testing various join scenarios +compile_profile: Compile and execute TPC-H across selected Cargo profiles, reporting timing and binary size + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Supported Configuration (Environment Variables) @@ -304,6 +307,13 @@ main() { # nlj uses range() function, no data generation needed echo "NLJ benchmark does not require data generation" ;; + hj) + # hj uses range() function, no data generation needed + echo "HJ benchmark does not require data generation" + ;; + compile_profile) + data_tpch "1" + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for data generation" usage @@ -313,20 +323,32 @@ main() { run) # Parse positional parameters BENCHMARK=${ARG2:-"${BENCHMARK}"} + EXTRA_ARGS=("${POSITIONAL_ARGS[@]:2}") + PROFILE_ARGS=() + QUERY="" + QUERY_ARG="" + if [ "$BENCHMARK" = "compile_profile" ]; then + PROFILE_ARGS=("${EXTRA_ARGS[@]}") + else + QUERY=${EXTRA_ARGS[0]} + if [ -n "$QUERY" ]; then + QUERY_ARG="--query ${QUERY}" + fi + fi BRANCH_NAME=$(cd "${DATAFUSION_DIR}" && git rev-parse --abbrev-ref HEAD) BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _ RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"} RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"} - # Optional query filter to run specific query - QUERY=${ARG3} - QUERY_ARG=$([ -n "$QUERY" ] && echo "--query ${QUERY}" || echo "") - echo "***************************" echo "DataFusion Benchmark Script" echo "COMMAND: ${COMMAND}" echo "BENCHMARK: ${BENCHMARK}" - echo "QUERY: ${QUERY:-All}" + if [ "$BENCHMARK" = "compile_profile" ]; then + echo "PROFILES: ${PROFILE_ARGS[*]:-All}" + else + echo "QUERY: ${QUERY:-All}" + fi echo "DATAFUSION_DIR: ${DATAFUSION_DIR}" echo "BRANCH_NAME: ${BRANCH_NAME}" echo "DATA_DIR: ${DATA_DIR}" @@ -361,6 +383,7 @@ main() { run_imdb run_external_aggr run_nlj + run_hj ;; tpch) run_tpch "1" "parquet" @@ -468,6 +491,12 @@ main() { nlj) run_nlj ;; + hj) + run_hj + ;; + compile_profile) + run_compile_profile "${PROFILE_ARGS[@]}" + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for run" usage @@ -593,6 +622,20 @@ run_tpch_mem() { debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 
--path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} } +# Runs the compile profile benchmark helper +run_compile_profile() { + local profiles=("$@") + local runner="${SCRIPT_DIR}/compile_profile.py" + local data_path="${DATA_DIR}/tpch_sf1" + + echo "Running compile profile benchmark..." + local cmd=(python3 "${runner}" --data "${data_path}") + if [ ${#profiles[@]} -gt 0 ]; then + cmd+=(--profiles "${profiles[@]}") + fi + debug_run "${cmd[@]}" +} + # Runs the cancellation benchmark run_cancellation() { RESULTS_FILE="${RESULTS_DIR}/cancellation.json" @@ -1103,6 +1146,14 @@ run_nlj() { debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} } +# Runs the hj benchmark +run_hj() { + RESULTS_FILE="${RESULTS_DIR}/hj.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running hj benchmark..." + debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} +} + compare_benchmarks() { BASE_RESULTS_DIR="${SCRIPT_DIR}/results" diff --git a/benchmarks/compile_profile.py b/benchmarks/compile_profile.py new file mode 100644 index 0000000000..ae51de9493 --- /dev/null +++ b/benchmarks/compile_profile.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Compile profile benchmark runner for DataFusion. + +Builds the `tpch` benchmark binary with several Cargo profiles (e.g. `--release` or `--profile ci`), runs the full TPC-H suite against the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile time, execution time, and resulting +binary size. + +See `benchmarks/README.md` for usages. 
+""" + +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Iterable, NamedTuple + +REPO_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_DATA_DIR = REPO_ROOT / "benchmarks" / "data" / "tpch_sf1" +DEFAULT_ITERATIONS = 1 +DEFAULT_FORMAT = "parquet" +DEFAULT_PARTITIONS: int | None = None +TPCH_BINARY = "tpch.exe" if os.name == "nt" else "tpch" +PROFILE_TARGET_DIR = { + "dev": "debug", + "release": "release", + "ci": "ci", + "release-nonlto": "release-nonlto", +} + + +class ProfileResult(NamedTuple): + profile: str + compile_seconds: float + run_seconds: float + binary_bytes: int + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--profiles", + nargs="+", + default=list(PROFILE_TARGET_DIR.keys()), + help="Cargo profiles to test (default: dev release ci release-nonlto)", + ) + parser.add_argument( + "--data", + type=Path, + default=DEFAULT_DATA_DIR, + help="Path to TPCH dataset (default: benchmarks/data/tpch_sf1)", + ) + return parser.parse_args() + + +def timed_run(command: Iterable[str]) -> float: + start = time.perf_counter() + try: + subprocess.run(command, cwd=REPO_ROOT, check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"command failed: {' '.join(command)}") from exc + return time.perf_counter() - start + + +def cargo_build(profile: str) -> float: + if profile == "dev": + command = ["cargo", "build", "--bin", "tpch"] + else: + command = ["cargo", "build", "--profile", profile, "--bin", "tpch"] + return timed_run(command) + + +def cargo_clean(profile: str) -> None: + command = ["cargo", "clean", "--profile", profile] + try: + subprocess.run(command, cwd=REPO_ROOT, check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"failed to clean cargo artifacts for profile '{profile}'") from exc + + +def run_benchmark(profile: str, data_path: Path) -> float: + binary_dir = PROFILE_TARGET_DIR.get(profile) + if not binary_dir: + raise ValueError(f"unknown profile '{profile}'") + binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY + if not binary_path.exists(): + raise FileNotFoundError(f"compiled binary not found at {binary_path}") + + command = [ + str(binary_path), + "benchmark", + "datafusion", + "--iterations", + str(DEFAULT_ITERATIONS), + "--path", + str(data_path), + "--format", + DEFAULT_FORMAT, + ] + if DEFAULT_PARTITIONS is not None: + command.extend(["--partitions", str(DEFAULT_PARTITIONS)]) + env = os.environ.copy() + env.setdefault("RUST_LOG", "warn") + + start = time.perf_counter() + try: + subprocess.run(command, cwd=REPO_ROOT, env=env, check=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"benchmark failed for profile '{profile}'") from exc + return time.perf_counter() - start + + +def binary_size(profile: str) -> int: + binary_dir = PROFILE_TARGET_DIR[profile] + binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY + return binary_path.stat().st_size + + +def human_time(seconds: float) -> str: + return f"{seconds:6.2f}s" + + +def human_size(size: int) -> str: + value = float(size) + for unit in ("B", "KB", "MB", "GB", "TB"): + if value < 1024 or unit == "TB": + return f"{value:6.1f}{unit}" + value /= 1024 + return f"{value:6.1f}TB" + + +def main() -> None: + args = parse_args() + data_path = args.data.resolve() + if not data_path.exists(): + print(f"Data directory not found: {data_path}", file=sys.stderr) + 
sys.exit(1) + + results: list[ProfileResult] = [] + for profile in args.profiles: + print(f"\n=== Profile: {profile} ===") + print("Cleaning previous build artifacts...") + cargo_clean(profile) + compile_seconds = cargo_build(profile) + run_seconds = run_benchmark(profile, data_path) + size_bytes = binary_size(profile) + results.append(ProfileResult(profile, compile_seconds, run_seconds, size_bytes)) + + print("\nSummary") + header = f"{'Profile':<15}{'Compile':>12}{'Run':>12}{'Size':>12}" + print(header) + print("-" * len(header)) + for result in results: + print( + f"{result.profile:<15}{human_time(result.compile_seconds):>12}" + f"{human_time(result.run_seconds):>12}{human_size(result.binary_bytes):>12}" + ) + +if __name__ == "__main__": + main() diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 88378492b7..816cae0e38 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -33,7 +33,9 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -use datafusion_benchmarks::{cancellation, clickbench, h2o, imdb, nlj, sort_tpch, tpch}; +use datafusion_benchmarks::{ + cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch, +}; #[derive(Debug, StructOpt)] #[structopt(about = "benchmark command")] @@ -41,6 +43,7 @@ enum Options { Cancellation(cancellation::RunOpt), Clickbench(clickbench::RunOpt), H2o(h2o::RunOpt), + HJ(hj::RunOpt), Imdb(imdb::RunOpt), Nlj(nlj::RunOpt), SortTpch(sort_tpch::RunOpt), @@ -57,6 +60,7 @@ pub async fn main() -> Result<()> { Options::Cancellation(opt) => opt.run().await, Options::Clickbench(opt) => opt.run().await, Options::H2o(opt) => opt.run().await, + Options::HJ(opt) => opt.run().await, Options::Imdb(opt) => Box::pin(opt.run()).await, Options::Nlj(opt) => opt.run().await, Options::SortTpch(opt) => opt.run().await, diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs new file mode 100644 index 0000000000..505b322745 --- /dev/null +++ b/benchmarks/src/hj.rs @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use datafusion::physical_plan::execute_stream; +use datafusion::{error::Result, prelude::SessionContext}; +use datafusion_common::instant::Instant; +use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError}; +use structopt::StructOpt; + +use futures::StreamExt; + +// TODO: Add existence joins + +/// Run the Hash Join benchmark +/// +/// This micro-benchmark focuses on the performance characteristics of Hash Joins. +/// It uses simple equality predicates to ensure a hash join is selected. 
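+/// For example, Q3 joins two `range(10000)` tables on `t1.value = t2.value`;
+/// before timing, the runner checks that the physical plan contains a
+/// `HashJoinExec` and fails the query otherwise.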
+/// Where we vary selectivity, we do so with additional cheap predicates that +/// do not change the join key (so the physical operator remains HashJoin). +#[derive(Debug, StructOpt, Clone)] +#[structopt(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number (between 1 and 12). If not specified, runs all queries + #[structopt(short, long)] + query: Option, + + /// Common options (iterations, batch size, target_partitions, etc.) + #[structopt(flatten)] + common: CommonOpt, + + /// If present, write results json here + #[structopt(parse(from_os_str), short = "o", long = "output")] + output_path: Option, +} + +/// Inline SQL queries for Hash Join benchmarks +/// +/// Each query's comment includes: +/// - Left row count × Right row count +/// - Join predicate selectivity (approximate output fraction). +/// - Q11 and Q12 selectivity is relative to cartesian product while the others are +/// relative to probe side. +const HASH_QUERIES: &[&str] = &[ + // Q1: INNER 10 x 10K | LOW ~0.1% + // equality on key + cheap filter to downselect + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 9000, 1000) AS t1(value) + JOIN range(10000) AS t2 + ON t1.value = t2.value; + "#, + // Q2: INNER 10 x 10K | LOW ~0.1% + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 9000, 1000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 5 = 0 + "#, + // Q3: INNER 10K x 10K | HIGH ~90% + r#" + SELECT t1.value, t2.value + FROM range(10000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 10 <> 0 + "#, + // Q4: INNER 30 x 30K | LOW ~0.1% + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 29000, 1000) AS t1 + JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 5 = 0 + "#, + // Q5: INNER 10 x 200K | VERY LOW ~0.005% (small to large) + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 9000, 1000) AS t1 + JOIN range(200000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q6: INNER 200K x 10 | VERY LOW ~0.005% (large to small) + r#" + SELECT t1.value, t2.value + FROM range(200000) AS t1 + JOIN generate_series(0, 9000, 1000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q7: RIGHT OUTER 10 x 200K | LOW ~0.1% + // Outer join still uses HashJoin for equi-keys; the extra filter reduces matches + r#" + SELECT t1.value AS l, t2.value AS r + FROM generate_series(0, 9000, 1000) AS t1 + RIGHT JOIN range(200000) AS t2 + ON t1.value = t2.value + WHERE t2.value % 1000 = 0 + "#, + // Q8: LEFT OUTER 200K x 10 | LOW ~0.1% + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(200000) AS t1 + LEFT JOIN generate_series(0, 9000, 1000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q9: FULL OUTER 30 x 30K | LOW ~0.1% + r#" + SELECT t1.value AS l, t2.value AS r + FROM generate_series(0, 29000, 1000) AS t1 + FULL JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE COALESCE(t1.value, t2.value) % 1000 = 0 + "#, + // Q10: FULL OUTER 30 x 30K | HIGH ~90% + r#" + SELECT t1.value AS l, t2.value AS r + FROM generate_series(0, 29000, 1000) AS t1 + FULL JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE COALESCE(t1.value, t2.value) % 10 <> 0 + "#, + // Q11: INNER 30 x 30K | MEDIUM ~50% | cheap predicate on parity + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 29000, 1000) AS t1 + INNER JOIN range(30000) AS t2 + ON (t1.value % 2) = (t2.value % 2) + "#, + // Q12: FULL OUTER 30 x 30K | MEDIUM ~50% | expression key + r#" + SELECT t1.value AS l, t2.value AS r + 
FROM generate_series(0, 29000, 1000) AS t1 + FULL JOIN range(30000) AS t2 + ON (t1.value % 2) = (t2.value % 2) + "#, + // Q13: INNER 30 x 30K | LOW 0.1% | modulo with adding values + r#" + SELECT t1.value, t2.value + FROM generate_series(0, 29000, 1000) AS t1 + INNER JOIN range(30000) AS t2 + ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 < 1) + "#, + // Q14: FULL OUTER 30 x 30K | ALL ~100% | modulo + r#" + SELECT t1.value AS l, t2.value AS r + FROM generate_series(0, 29000, 1000) AS t1 + FULL JOIN range(30000) AS t2 + ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 = 0) + "#, +]; + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running Hash Join benchmarks with the following options: {self:#?}\n"); + + let query_range = match self.query { + Some(query_id) => { + if query_id >= 1 && query_id <= HASH_QUERIES.len() { + query_id..=query_id + } else { + return exec_err!( + "Query {query_id} not found. Available queries: 1 to {}", + HASH_QUERIES.len() + ); + } + } + None => 1..=HASH_QUERIES.len(), + }; + + let config = self.common.config()?; + let rt_builder = self.common.runtime_env_builder()?; + let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + + let mut benchmark_run = BenchmarkRun::new(); + + for query_id in query_range { + let query_index = query_id - 1; + let sql = HASH_QUERIES[query_index]; + + benchmark_run.start_new_case(&format!("Query {query_id}")); + let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await; + match query_run { + Ok(query_results) => { + for iter in query_results { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + Err(e) => { + return Err(DataFusionError::Context( + format!("Hash Join benchmark Q{query_id} failed with error:"), + Box::new(e), + )); + } + } + } + + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + /// Validates that the physical plan uses a HashJoin, then executes. + async fn benchmark_query( + &self, + sql: &str, + query_name: &str, + ctx: &SessionContext, + ) -> Result> { + let mut query_results = vec![]; + + // Build/validate plan + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let plan_string = format!("{physical_plan:#?}"); + + if !plan_string.contains("HashJoinExec") { + return Err(exec_datafusion_err!( + "Query {query_name} does not use Hash Join. Physical plan: {plan_string}" + )); + } + + // Execute without buffering + for i in 0..self.common.iterations { + let start = Instant::now(); + let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?; + let elapsed = start.elapsed(); + + println!( + "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}" + ); + + query_results.push(QueryResult { elapsed, row_count }); + } + + Ok(query_results) + } + + /// Executes the SQL query and drops each batch to avoid result buffering. 
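+    /// The plan is executed with `execute_stream` and each `RecordBatch` is
+    /// consumed only to accumulate `num_rows`, so results never pile up in memory.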
+ async fn execute_sql_without_result_buffering( + sql: &str, + ctx: &SessionContext, + ) -> Result { + let mut row_count = 0; + + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let mut stream = execute_stream(physical_plan, ctx.task_ctx())?; + + while let Some(batch) = stream.next().await { + row_count += batch?.num_rows(); + // Drop batches immediately to minimize memory pressure + } + + Ok(row_count) + } +} diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs index 90e0947f64..3d58d5f54d 100644 --- a/benchmarks/src/imdb/run.rs +++ b/benchmarks/src/imdb/run.rs @@ -570,7 +570,7 @@ mod tests { let plan = ctx.sql(&query).await?; let plan = plan.create_physical_plan().await?; let bytes = physical_plan_to_bytes(plan.clone())?; - let plan2 = physical_plan_from_bytes(&bytes, &ctx)?; + let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?; let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false)); let plan2_formatted = format!("{}", displayable(plan2.as_ref()).indent(false)); diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index 5d982fad6f..07cffa5ae4 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -19,6 +19,7 @@ pub mod cancellation; pub mod clickbench; pub mod h2o; +pub mod hj; pub mod imdb; pub mod nlj; pub mod sort_tpch; diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 30ecb4d33b..b93bdf254a 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -424,7 +424,7 @@ mod tests { let plan = ctx.sql(&query).await?; let plan = plan.create_physical_plan().await?; let bytes = physical_plan_to_bytes(plan.clone())?; - let plan2 = physical_plan_from_bytes(&bytes, &ctx)?; + let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?; let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false)); let plan2_formatted = format!("{}", displayable(plan2.as_ref()).indent(false)); diff --git a/ci/scripts/license_header.sh b/ci/scripts/license_header.sh new file mode 100755 index 0000000000..5345728f9c --- /dev/null +++ b/ci/scripts/license_header.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Check Apache license header +set -ex +hawkeye check --config licenserc.toml diff --git a/datafusion-cli/CONTRIBUTING.md b/datafusion-cli/CONTRIBUTING.md index 7c19c8e8bf..8be656ec4e 100644 --- a/datafusion-cli/CONTRIBUTING.md +++ b/datafusion-cli/CONTRIBUTING.md @@ -21,10 +21,16 @@ ## Running Tests -Tests can be run using `cargo` +First check out test files with ```shell -cargo test +git submodule update --init +``` + +Then run all the tests with + +```shell +cargo test --all-targets ``` ## Running Storage Integration Tests diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index ca796b525f..b34aa77037 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -19,12 +19,15 @@ -# DataFusion Command-line Interface +# Apache DataFusion Command-line Interface -[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine. +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ + # Frequently Asked Questions ## Where can I find more information? diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index ab1726f226..a6b818c109 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -571,15 +571,15 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - assert_snapshot!(batches_to_string(&rbs),@r#" + assert_snapshot!(batches_to_string(&rbs),@r" +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ | alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true | + | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ - "#); + "); // increase the number of hits ctx.sql("select * from alltypes_plain") @@ -602,15 +602,15 @@ mod tests { let df = ctx.sql(sql).await?; let rbs = df.collect().await?; - assert_snapshot!(batches_to_string(&rbs),@r#" + assert_snapshot!(batches_to_string(&rbs),@r" +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ | alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true | + | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ - "#); + "); Ok(()) } diff --git a/datafusion-examples/examples/composed_extension_codec.rs b/datafusion-examples/examples/composed_extension_codec.rs index d3548167f1..57f2c37041 100644 --- 
a/datafusion-examples/examples/composed_extension_codec.rs +++ b/datafusion-examples/examples/composed_extension_codec.rs @@ -32,12 +32,11 @@ use std::any::Any; use std::fmt::Debug; -use std::ops::Deref; use std::sync::Arc; use datafusion::common::internal_err; use datafusion::common::Result; -use datafusion::logical_expr::registry::FunctionRegistry; +use datafusion::execution::TaskContext; use datafusion::physical_plan::{DisplayAs, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion_proto::physical_plan::{ @@ -71,9 +70,8 @@ async fn main() { .expect("to proto"); // deserialize proto back to execution plan - let runtime = ctx.runtime_env(); let result_exec_plan: Arc = proto - .try_into_physical_plan(&ctx, runtime.deref(), &composed_codec) + .try_into_physical_plan(&ctx.task_ctx(), &composed_codec) .expect("from proto"); // assert that the original and deserialized execution plans are equal @@ -124,7 +122,7 @@ impl ExecutionPlan for ParentExec { fn execute( &self, _partition: usize, - _context: Arc, + _context: Arc, ) -> Result { unreachable!() } @@ -139,7 +137,7 @@ impl PhysicalExtensionCodec for ParentPhysicalExtensionCodec { &self, buf: &[u8], inputs: &[Arc], - _registry: &dyn FunctionRegistry, + _ctx: &TaskContext, ) -> Result> { if buf == "ParentExec".as_bytes() { Ok(Arc::new(ParentExec { @@ -200,7 +198,7 @@ impl ExecutionPlan for ChildExec { fn execute( &self, _partition: usize, - _context: Arc, + _context: Arc, ) -> Result { unreachable!() } @@ -215,7 +213,7 @@ impl PhysicalExtensionCodec for ChildPhysicalExtensionCodec { &self, buf: &[u8], _inputs: &[Arc], - _registry: &dyn FunctionRegistry, + _ctx: &TaskContext, ) -> Result> { if buf == "ChildExec".as_bytes() { Ok(Arc::new(ChildExec {})) diff --git a/datafusion-examples/examples/custom_file_casts.rs b/datafusion-examples/examples/custom_file_casts.rs index e30ea1fb7e..65ca096820 100644 --- a/datafusion-examples/examples/custom_file_casts.rs +++ b/datafusion-examples/examples/custom_file_casts.rs @@ -183,14 +183,18 @@ impl PhysicalExprAdapter for CustomCastsPhysicalExprAdapter { // For example, [DataFusion Comet](https://github.com/apache/datafusion-comet) has a [custom cast kernel](https://github.com/apache/datafusion-comet/blob/b4ac876ab420ed403ac7fc8e1b29f42f1f442566/native/spark-expr/src/conversion_funcs/cast.rs#L133-L138). 
expr.transform(|expr| { if let Some(cast) = expr.as_any().downcast_ref::() { - let input_data_type = cast.expr().data_type(&self.physical_file_schema)?; + let input_data_type = + cast.expr().data_type(&self.physical_file_schema)?; let output_data_type = cast.data_type(&self.physical_file_schema)?; if !cast.is_bigger_cast(&input_data_type) { - return not_impl_err!("Unsupported CAST from {input_data_type:?} to {output_data_type:?}") + return not_impl_err!( + "Unsupported CAST from {input_data_type} to {output_data_type}" + ); } } Ok(Transformed::no(expr)) - }).data() + }) + .data() } fn with_partition_values( diff --git a/datafusion-examples/examples/date_time_functions.rs b/datafusion-examples/examples/date_time_functions.rs index dbe9970439..2628319ae3 100644 --- a/datafusion-examples/examples/date_time_functions.rs +++ b/datafusion-examples/examples/date_time_functions.rs @@ -492,14 +492,14 @@ async fn query_to_char() -> Result<()> { assert_batches_eq!( &[ - "+------------------------------+", - "| to_char(t.values,t.patterns) |", - "+------------------------------+", - "| 2020-09-01 |", - "| 2020:09:02 |", - "| 20200903 |", - "| 04-09-2020 |", - "+------------------------------+", + "+----------------------------------+", + "| date_format(t.values,t.patterns) |", + "+----------------------------------+", + "| 2020-09-01 |", + "| 2020:09:02 |", + "| 20200903 |", + "| 04-09-2020 |", + "+----------------------------------+", ], &result ); diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index ac17bd417c..56f960870e 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -519,7 +519,7 @@ fn type_coercion_demo() -> Result<()> { )?; let i8_array = Int8Array::from_iter_values(vec![0, 1, 2]); let batch = RecordBatch::try_new( - Arc::new(df_schema.as_arrow().to_owned()), + Arc::clone(df_schema.inner()), vec![Arc::new(i8_array) as _], )?; diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/flight_server.rs index cc5f43746d..58bfb7a341 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/flight_server.rs @@ -98,7 +98,7 @@ impl FlightService for FlightServiceImpl { let df = ctx.sql(sql).await.map_err(to_tonic_err)?; // execute the query - let schema = df.schema().clone().into(); + let schema = Arc::clone(df.schema().inner()); let results = df.collect().await.map_err(to_tonic_err)?; if results.is_empty() { return Err(Status::internal("There were no results from ticket")); diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/flight_sql_server.rs index 5a573ed523..c35debec7d 100644 --- a/datafusion-examples/examples/flight/flight_sql_server.rs +++ b/datafusion-examples/examples/flight/flight_sql_server.rs @@ -395,10 +395,8 @@ impl FlightSqlService for FlightSqlServiceImpl { let plan_uuid = Uuid::new_v4().hyphenated().to_string(); self.statements.insert(plan_uuid.clone(), plan.clone()); - let plan_schema = plan.schema(); - - let arrow_schema = (&**plan_schema).into(); - let message = SchemaAsIpc::new(&arrow_schema, &IpcWriteOptions::default()) + let arrow_schema = plan.schema().as_arrow(); + let message = SchemaAsIpc::new(arrow_schema, &IpcWriteOptions::default()) .try_into() .map_err(|e| status!("Unable to serialize schema", e))?; let IpcMessage(schema_bytes) = message; diff --git a/datafusion/catalog-listing/Cargo.toml 
b/datafusion/catalog-listing/Cargo.toml index b88461e7eb..38b843dec5 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-catalog-listing" description = "datafusion-catalog-listing" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/catalog-listing/README.md b/datafusion/catalog-listing/README.md index c8d1cf13b4..81a7c7b1da 100644 --- a/datafusion/catalog-listing/README.md +++ b/datafusion/catalog-listing/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion catalog-listing +# Apache DataFusion Catalog Listing -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion with [ListingTable], an implementation of [TableProvider] based on files in a directory (either locally or on remote @@ -29,8 +29,8 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [listingtable]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html [tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 5edb1c4a68..a1db45654b 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-catalog" description = "datafusion-catalog" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/catalog/README.md b/datafusion/catalog/README.md index d4870e28f3..48c61b43c0 100644 --- a/datafusion/catalog/README.md +++ b/datafusion/catalog/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Catalog +# Apache DataFusion Catalog -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides catalog management functionality, including catalogs, schemas, and tables. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/catalog/src/information_schema.rs b/datafusion/catalog/src/information_schema.rs index 17dbc31d95..d733551f44 100644 --- a/datafusion/catalog/src/information_schema.rs +++ b/datafusion/catalog/src/information_schema.rs @@ -480,7 +480,7 @@ fn get_udwf_args_and_return_types( #[inline] fn remove_native_type_prefix(native_type: NativeType) -> String { - format!("{native_type:?}") + format!("{native_type}") } #[async_trait] @@ -827,8 +827,7 @@ impl InformationSchemaColumnsBuilder { self.is_nullables.append_value(nullable_str); // "System supplied type" --> Use debug format of the datatype - self.data_types - .append_value(format!("{:?}", field.data_type())); + self.data_types.append_value(field.data_type().to_string()); // "If data_type identifies a character or bit string type, the // declared maximum length; null for all other data types or diff --git a/datafusion/catalog/src/listing_schema.rs b/datafusion/catalog/src/listing_schema.rs index 2e4eac964b..7e19c1ecaa 100644 --- a/datafusion/catalog/src/listing_schema.rs +++ b/datafusion/catalog/src/listing_schema.rs @@ -136,6 +136,7 @@ impl ListingSchemaProvider { file_type: self.format.clone(), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], diff --git a/datafusion/catalog/src/stream.rs b/datafusion/catalog/src/stream.rs index 0fab9beba8..2d66ff4628 100644 --- a/datafusion/catalog/src/stream.rs +++ b/datafusion/catalog/src/stream.rs @@ -53,7 +53,7 @@ impl TableProviderFactory for StreamTableFactory { state: &dyn Session, cmd: &CreateExternalTable, ) -> Result> { - let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into()); + let schema: SchemaRef = Arc::clone(cmd.schema.inner()); let location = cmd.location.clone(); let encoding = cmd.file_type.parse()?; let header = if let Ok(opt) = cmd diff --git a/datafusion/catalog/src/view.rs b/datafusion/catalog/src/view.rs index 3bb7214399..89c6a4a224 100644 --- a/datafusion/catalog/src/view.rs +++ b/datafusion/catalog/src/view.rs @@ -51,7 +51,7 @@ impl ViewTable { /// Notes: the `LogicalPlan` is not validated or type coerced. If this is /// needed it should be done after calling this function. pub fn new(logical_plan: LogicalPlan, definition: Option) -> Self { - let table_schema = logical_plan.schema().as_ref().to_owned().into(); + let table_schema = Arc::clone(logical_plan.schema().inner()); Self { logical_plan, table_schema, diff --git a/datafusion/common-runtime/README.md b/datafusion/common-runtime/README.md index bd0d4954b8..ff44e6c3e2 100644 --- a/datafusion/common-runtime/README.md +++ b/datafusion/common-runtime/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Common Runtime +# Apache DataFusion Common Runtime -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides common utilities. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
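The `or_replace: false` field added to the `CreateExternalTable` command in the `ListingSchemaProvider` hunk above (and repeated in test fixtures later in this diff) backs the new `CREATE OR REPLACE EXTERNAL TABLE` handling in `SessionContext` further down. A rough usage sketch, assuming the SQL front end accepts `OR REPLACE` for external tables as the threaded flag suggests; the file path is hypothetical.

use datafusion::common::Result;
use datafusion::prelude::SessionContext;

async fn replace_external_table(ctx: &SessionContext) -> Result<()> {
    // First registration of the table.
    ctx.sql("CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'data.csv'")
        .await?;
    // Re-running with OR REPLACE re-registers the table instead of failing
    // with "External table 't' already exists".
    ctx.sql("CREATE OR REPLACE EXTERNAL TABLE t STORED AS CSV LOCATION 'data.csv'")
        .await?;
    Ok(())
}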
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index d0e78e0712..f5e51cb236 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -66,7 +66,7 @@ half = { workspace = true } hashbrown = { workspace = true } hex = { workspace = true, optional = true } indexmap = { workspace = true } -libc = "0.2.175" +libc = "0.2.176" log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } diff --git a/datafusion/common/README.md b/datafusion/common/README.md index e4d6b77265..4948c8c581 100644 --- a/datafusion/common/README.md +++ b/datafusion/common/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Common +# Apache DataFusion Common -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides common data types and utilities. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs index 68b753a667..e6eda3c585 100644 --- a/datafusion/common/src/cast.rs +++ b/datafusion/common/src/cast.rs @@ -22,9 +22,10 @@ use crate::{downcast_value, Result}; use arrow::array::{ - BinaryViewArray, DurationMicrosecondArray, DurationMillisecondArray, - DurationNanosecondArray, DurationSecondArray, Float16Array, Int16Array, Int8Array, - LargeBinaryArray, LargeStringArray, StringViewArray, UInt16Array, + BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray, + DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array, + Int16Array, Int8Array, LargeBinaryArray, LargeStringArray, StringViewArray, + UInt16Array, }; use arrow::{ array::{ @@ -97,6 +98,16 @@ pub fn as_uint64_array(array: &dyn Array) -> Result<&UInt64Array> { Ok(downcast_value!(array, UInt64Array)) } +// Downcast Array to Decimal32Array +pub fn as_decimal32_array(array: &dyn Array) -> Result<&Decimal32Array> { + Ok(downcast_value!(array, Decimal32Array)) +} + +// Downcast Array to Decimal64Array +pub fn as_decimal64_array(array: &dyn Array) -> Result<&Decimal64Array> { + Ok(downcast_value!(array, Decimal64Array)) +} + // Downcast Array to Decimal128Array pub fn as_decimal128_array(array: &dyn Array) -> Result<&Decimal128Array> { Ok(downcast_value!(array, Decimal128Array)) @@ -302,7 +313,7 @@ pub fn as_fixed_size_list_array(array: &dyn Array) -> Result<&FixedSizeListArray Ok(downcast_value!(array, FixedSizeListArray)) } -// Downcast Array to FixedSizeListArray +// Downcast Array to FixedSizeBinaryArray pub fn as_fixed_size_binary_array(array: &dyn Array) -> Result<&FixedSizeBinaryArray> { Ok(downcast_value!(array, FixedSizeBinaryArray)) } diff --git a/datafusion/common/src/config.rs 
b/datafusion/common/src/config.rs index 68968e7296..adfb1a6efd 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -566,6 +566,14 @@ config_namespace! { /// (reading) Use any available bloom filters when reading parquet files pub bloom_filter_on_read: bool, default = true + /// (reading) The maximum predicate cache size, in bytes. When + /// `pushdown_filters` is enabled, sets the maximum memory used to cache + /// the results of predicate evaluation between filter evaluation and + /// output generation. Decreasing this value will reduce memory usage, + /// but may increase IO and CPU usage. None means use the default + /// parquet reader setting. 0 means no caching. + pub max_predicate_cache_size: Option, default = None + // The following options affect writing to parquet files // and map to parquet::file::properties::WriterProperties diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 48d2555031..b195b1d4a1 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -101,7 +101,7 @@ pub type DFSchemaRef = Arc; /// let df_schema = DFSchema::from_unqualified_fields(vec![ /// Field::new("c1", arrow::datatypes::DataType::Int32, false), /// ].into(),HashMap::new()).unwrap(); -/// let schema = Schema::from(df_schema); +/// let schema: &Schema = df_schema.as_arrow(); /// assert_eq!(schema.fields().len(), 1); /// ``` #[derive(Debug, Clone, PartialEq, Eq)] @@ -594,7 +594,7 @@ impl DFSchema { &self, arrow_schema: &Schema, ) -> Result<()> { - let self_arrow_schema: Schema = self.into(); + let self_arrow_schema = self.as_arrow(); self_arrow_schema .fields() .iter() @@ -669,8 +669,8 @@ impl DFSchema { )) { _plan_err!( - "Schema mismatch: Expected field '{}' with type {:?}, \ - but got '{}' with type {:?}.", + "Schema mismatch: Expected field '{}' with type {}, \ + but got '{}' with type {}.", f1.name(), f1.data_type(), f2.name(), @@ -747,7 +747,8 @@ impl DFSchema { } /// Returns true of two [`DataType`]s are semantically equal (same - /// name and type), ignoring both metadata and nullability, and decimal precision/scale. + /// name and type), ignoring both metadata and nullability, decimal precision/scale, + /// and timezone time units/timezones. 
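The `max_predicate_cache_size` option introduced in the config hunk above is an ordinary session-level Parquet setting, so it can be tuned like the other reader knobs. A small sketch; the `datafusion.execution.parquet.` key prefix follows the usual naming for these options and is an assumption here.

use datafusion::prelude::{SessionConfig, SessionContext};

fn session_with_bounded_predicate_cache() -> SessionContext {
    // Cap the predicate-evaluation cache at roughly 1 MiB. Setting it to 0
    // disables caching; leaving it unset keeps the parquet reader default.
    let config = SessionConfig::new().set_usize(
        "datafusion.execution.parquet.max_predicate_cache_size",
        1024 * 1024,
    );
    SessionContext::new_with_config(config)
}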
/// /// request to upstream: pub fn datatype_is_semantically_equal(dt1: &DataType, dt2: &DataType) -> bool { @@ -798,6 +799,14 @@ impl DFSchema { .zip(iter2) .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_semantically_equal(f1, f2)) } + ( + DataType::Decimal32(_l_precision, _l_scale), + DataType::Decimal32(_r_precision, _r_scale), + ) => true, + ( + DataType::Decimal64(_l_precision, _l_scale), + DataType::Decimal64(_r_precision, _r_scale), + ) => true, ( DataType::Decimal128(_l_precision, _l_scale), DataType::Decimal128(_r_precision, _r_scale), @@ -806,6 +815,10 @@ impl DFSchema { DataType::Decimal256(_l_precision, _l_scale), DataType::Decimal256(_r_precision, _r_scale), ) => true, + ( + DataType::Timestamp(_l_time_unit, _l_timezone), + DataType::Timestamp(_r_time_unit, _r_timezone), + ) => true, _ => dt1 == dt2, } } @@ -1056,6 +1069,12 @@ fn format_simple_data_type(data_type: &DataType) -> String { DataType::Dictionary(_, value_type) => { format_simple_data_type(value_type.as_ref()) } + DataType::Decimal32(precision, scale) => { + format!("decimal32({precision}, {scale})") + } + DataType::Decimal64(precision, scale) => { + format!("decimal64({precision}, {scale})") + } DataType::Decimal128(precision, scale) => { format!("decimal128({precision}, {scale})") } @@ -1063,23 +1082,7 @@ fn format_simple_data_type(data_type: &DataType) -> String { format!("decimal256({precision}, {scale})") } DataType::Null => "null".to_string(), - _ => format!("{data_type:?}").to_lowercase(), - } -} - -impl From for Schema { - /// Convert DFSchema into a Schema - fn from(df_schema: DFSchema) -> Self { - let fields: Fields = df_schema.inner.fields.clone(); - Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) - } -} - -impl From<&DFSchema> for Schema { - /// Convert DFSchema reference into a Schema - fn from(df_schema: &DFSchema) -> Self { - let fields: Fields = df_schema.inner.fields.clone(); - Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) + _ => format!("{data_type}").to_lowercase(), } } @@ -1115,17 +1118,15 @@ impl TryFrom for DFSchema { field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), }; - dfschema.check_names()?; + // Without checking names, because schema here may have duplicate field names. + // For example, Partial AggregateMode will generate duplicate field names from + // state_fields. + // See + // dfschema.check_names()?; Ok(dfschema) } } -impl From for SchemaRef { - fn from(df_schema: DFSchema) -> Self { - SchemaRef::new(df_schema.into()) - } -} - // Hashing refers to a subset of fields considered in PartialEq. 
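Because the `From<DFSchema> for Schema` and `From<DFSchema> for SchemaRef` impls are removed above, the other hunks in this diff (`ViewTable`, `StreamTableFactory`, the flight examples, `DataFrameTableProvider`) switch to reusing the Arrow schema that `DFSchema` already owns. A minimal sketch of the replacement pattern:

use std::sync::Arc;
use arrow::datatypes::{Schema, SchemaRef};
use datafusion::common::DFSchema;

fn owned_arrow_schema(df_schema: &DFSchema) -> SchemaRef {
    // Old call sites rebuilt the Schema, e.g.
    // Arc::new(plan.schema().as_ref().to_owned().into()), which no longer compiles.
    // New pattern: clone the Arc<Schema> that DFSchema already stores.
    Arc::clone(df_schema.inner())
}

fn borrowed_arrow_schema(df_schema: &DFSchema) -> &Schema {
    // When a borrow is enough, as_arrow() avoids any clone at all.
    df_schema.as_arrow()
}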
impl Hash for DFSchema { fn hash(&self, state: &mut H) { @@ -1308,8 +1309,8 @@ impl SchemaExt for Schema { .try_for_each(|(f1, f2)| { if f1.name() != f2.name() || (!DFSchema::datatype_is_logically_equal(f1.data_type(), f2.data_type()) && !can_cast_types(f2.data_type(), f1.data_type())) { _plan_err!( - "Inserting query schema mismatch: Expected table field '{}' with type {:?}, \ - but got '{}' with type {:?}.", + "Inserting query schema mismatch: Expected table field '{}' with type {}, \ + but got '{}' with type {}.", f1.name(), f1.data_type(), f2.name(), @@ -1415,7 +1416,7 @@ mod tests { #[test] fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; - let arrow_schema: Schema = schema.into(); + let arrow_schema = schema.as_arrow(); let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }"; assert_eq!(expected, arrow_schema.to_string()); @@ -1794,6 +1795,36 @@ mod tests { &DataType::Int16 )); + // Succeeds if decimal precision and scale are different + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Decimal32(1, 2), + &DataType::Decimal32(2, 1), + )); + + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Decimal64(1, 2), + &DataType::Decimal64(2, 1), + )); + + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Decimal128(1, 2), + &DataType::Decimal128(2, 1), + )); + + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Decimal256(1, 2), + &DataType::Decimal256(2, 1), + )); + + // Any two timestamp types should match + assert!(DFSchema::datatype_is_semantically_equal( + &DataType::Timestamp( + arrow::datatypes::TimeUnit::Microsecond, + Some("UTC".into()) + ), + &DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + )); + // Test lists // Succeeds if both have the same element type, disregards names and nullability @@ -2377,6 +2408,8 @@ mod tests { ), false, ), + Field::new("decimal32", DataType::Decimal32(9, 4), true), + Field::new("decimal64", DataType::Decimal64(9, 4), true), Field::new("decimal128", DataType::Decimal128(18, 4), true), Field::new("decimal256", DataType::Decimal256(38, 10), false), Field::new("date32", DataType::Date32, true), @@ -2408,6 +2441,8 @@ mod tests { |-- fixed_size_binary: fixed_size_binary (nullable = true) |-- fixed_size_list: fixed size list (nullable = false) | |-- item: int32 (nullable = true) + |-- decimal32: decimal32(9, 4) (nullable = true) + |-- decimal64: decimal64(9, 4) (nullable = true) |-- decimal128: decimal128(18, 4) (nullable = true) |-- decimal256: decimal256(38, 10) (nullable = false) |-- date32: date32 (nullable = true) diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index a48700a9ee..3977f2b489 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -208,6 +208,7 @@ impl ParquetOptions { binary_as_string: _, // not used for writer props coerce_int96: _, // not used for writer props skip_arrow_metadata: _, + max_predicate_cache_size: _, } = self; let mut builder = WriterProperties::builder() @@ -400,6 +401,10 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Resulti64, 'b','c' ignored, 'd' filled with nulls /// @@ -230,7 +230,7 @@ pub fn validate_struct_compatibility( 
target_field.data_type(), ) { return _plan_err!( - "Cannot cast struct field '{}' from type {:?} to type {:?}", + "Cannot cast struct field '{}' from type {} to type {}", target_field.name(), source_field.data_type(), target_field.data_type() diff --git a/datafusion/common/src/param_value.rs b/datafusion/common/src/param_value.rs index d2802c096d..7582cff56f 100644 --- a/datafusion/common/src/param_value.rs +++ b/datafusion/common/src/param_value.rs @@ -48,7 +48,7 @@ impl ParamValues { for (i, (param_type, value)) in iter.enumerate() { if *param_type != value.data_type() { return _plan_err!( - "Expected parameter of type {:?}, got {:?} at index {}", + "Expected parameter of type {}, got {:?} at index {}", param_type, value.data_type(), i diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c5e764272b..8c079056e2 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -35,13 +35,14 @@ use std::sync::Arc; use crate::cast::{ as_binary_array, as_binary_view_array, as_boolean_array, as_date32_array, - as_date64_array, as_decimal128_array, as_decimal256_array, as_dictionary_array, - as_duration_microsecond_array, as_duration_millisecond_array, - as_duration_nanosecond_array, as_duration_second_array, as_fixed_size_binary_array, - as_fixed_size_list_array, as_float16_array, as_float32_array, as_float64_array, - as_int16_array, as_int32_array, as_int64_array, as_int8_array, as_interval_dt_array, - as_interval_mdn_array, as_interval_ym_array, as_large_binary_array, - as_large_list_array, as_large_string_array, as_string_array, as_string_view_array, + as_date64_array, as_decimal128_array, as_decimal256_array, as_decimal32_array, + as_decimal64_array, as_dictionary_array, as_duration_microsecond_array, + as_duration_millisecond_array, as_duration_nanosecond_array, + as_duration_second_array, as_fixed_size_binary_array, as_fixed_size_list_array, + as_float16_array, as_float32_array, as_float64_array, as_int16_array, as_int32_array, + as_int64_array, as_int8_array, as_interval_dt_array, as_interval_mdn_array, + as_interval_ym_array, as_large_binary_array, as_large_list_array, + as_large_string_array, as_string_array, as_string_view_array, as_time32_millisecond_array, as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array, as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, @@ -56,13 +57,14 @@ use crate::{_internal_datafusion_err, arrow_datafusion_err}; use arrow::array::{ new_empty_array, new_null_array, Array, ArrayData, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, DictionaryArray, - DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, - DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, Float16Array, - Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, - Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, - LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, MapArray, - MutableArrayData, PrimitiveArray, Scalar, StringArray, StringViewArray, StructArray, + Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + DictionaryArray, DurationMicrosecondArray, DurationMillisecondArray, + DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, + FixedSizeListArray, Float16Array, Float32Array, 
Float64Array, GenericListArray, + Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, + IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeListArray, + LargeStringArray, ListArray, MapArray, MutableArrayData, OffsetSizeTrait, + PrimitiveArray, Scalar, StringArray, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, @@ -75,12 +77,13 @@ use arrow::compute::kernels::numeric::{ }; use arrow::datatypes::{ i256, validate_decimal_precision_and_scale, ArrowDictionaryKeyType, ArrowNativeType, - ArrowTimestampType, DataType, Date32Type, Decimal128Type, Decimal256Type, Field, - Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTime, - IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType, IntervalUnit, - IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, UnionFields, UnionMode, DECIMAL128_MAX_PRECISION, + ArrowTimestampType, DataType, Date32Type, Decimal128Type, Decimal256Type, + Decimal32Type, Decimal64Type, Field, Float32Type, Int16Type, Int32Type, Int64Type, + Int8Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano, + IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, UnionFields, + UnionMode, DECIMAL128_MAX_PRECISION, }; use arrow::util::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use cache::{get_or_create_cached_key_array, get_or_create_cached_null_array}; @@ -231,6 +234,10 @@ pub enum ScalarValue { Float32(Option), /// 64bit float Float64(Option), + /// 32bit decimal, using the i32 to represent the decimal, precision scale + Decimal32(Option, u8, i8), + /// 64bit decimal, using the i64 to represent the decimal, precision scale + Decimal64(Option, u8, i8), /// 128bit decimal, using the i128 to represent the decimal, precision scale Decimal128(Option, u8, i8), /// 256bit decimal, using the i256 to represent the decimal, precision scale @@ -340,6 +347,14 @@ impl PartialEq for ScalarValue { // any newly added enum variant will require editing this list // or else face a compile error match (self, other) { + (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => { + v1.eq(v2) && p1.eq(p2) && s1.eq(s2) + } + (Decimal32(_, _, _), _) => false, + (Decimal64(v1, p1, s1), Decimal64(v2, p2, s2)) => { + v1.eq(v2) && p1.eq(p2) && s1.eq(s2) + } + (Decimal64(_, _, _), _) => false, (Decimal128(v1, p1, s1), Decimal128(v2, p2, s2)) => { v1.eq(v2) && p1.eq(p2) && s1.eq(s2) } @@ -459,6 +474,24 @@ impl PartialOrd for ScalarValue { // any newly added enum variant will require editing this list // or else face a compile error match (self, other) { + (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => { + if p1.eq(p2) && s1.eq(s2) { + v1.partial_cmp(v2) + } else { + // Two decimal values can be compared if they have the same precision and scale. + None + } + } + (Decimal32(_, _, _), _) => None, + (Decimal64(v1, p1, s1), Decimal64(v2, p2, s2)) => { + if p1.eq(p2) && s1.eq(s2) { + v1.partial_cmp(v2) + } else { + // Two decimal values can be compared if they have the same precision and scale. 
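As the `PartialOrd` arms above spell out, the new `Decimal32`/`Decimal64` scalar variants only order against values with the same precision and scale. A small illustration with made-up values:

use datafusion::common::ScalarValue;

fn decimal64_ordering_demo() {
    // Same precision and scale: ordered by the underlying i64 value.
    let a = ScalarValue::Decimal64(Some(12_345), 10, 2); // 123.45
    let b = ScalarValue::Decimal64(Some(67_800), 10, 2); // 678.00
    assert!(a < b);

    // Different precision (or scale): partial_cmp yields None, so neither
    // `<` nor `>` holds.
    let c = ScalarValue::Decimal64(Some(12_345), 12, 2);
    assert_eq!(a.partial_cmp(&c), None);
    assert!(!(a < c) && !(c < a));
}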
+ None + } + } + (Decimal64(_, _, _), _) => None, (Decimal128(v1, p1, s1), Decimal128(v2, p2, s2)) => { if p1.eq(p2) && s1.eq(s2) { v1.partial_cmp(v2) @@ -760,6 +793,16 @@ impl Hash for ScalarValue { fn hash(&self, state: &mut H) { use ScalarValue::*; match self { + Decimal32(v, p, s) => { + v.hash(state); + p.hash(state); + s.hash(state) + } + Decimal64(v, p, s) => { + v.hash(state); + p.hash(state); + s.hash(state) + } Decimal128(v, p, s) => { v.hash(state); p.hash(state); @@ -834,8 +877,9 @@ impl Hash for ScalarValue { } fn hash_nested_array(arr: ArrayRef, state: &mut H) { - let arrays = vec![arr.to_owned()]; - let hashes_buffer = &mut vec![0; arr.len()]; + let len = arr.len(); + let arrays = vec![arr]; + let hashes_buffer = &mut vec![0; len]; let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0); let hashes = create_hashes(&arrays, &random_state, hashes_buffer).unwrap(); // Hash back to std::hash::Hasher @@ -1045,6 +1089,12 @@ impl ScalarValue { DataType::UInt16 => ScalarValue::UInt16(None), DataType::UInt32 => ScalarValue::UInt32(None), DataType::UInt64 => ScalarValue::UInt64(None), + DataType::Decimal32(precision, scale) => { + ScalarValue::Decimal32(None, *precision, *scale) + } + DataType::Decimal64(precision, scale) => { + ScalarValue::Decimal64(None, *precision, *scale) + } DataType::Decimal128(precision, scale) => { ScalarValue::Decimal128(None, *precision, *scale) } @@ -1137,7 +1187,7 @@ impl ScalarValue { DataType::Null => ScalarValue::Null, _ => { return _not_impl_err!( - "Can't create a null scalar from data_type \"{data_type:?}\"" + "Can't create a null scalar from data_type \"{data_type}\"" ); } }) @@ -1193,7 +1243,7 @@ impl ScalarValue { match datatype { DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::PI)), DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::PI)), - _ => _internal_err!("PI is not supported for data type: {:?}", datatype), + _ => _internal_err!("PI is not supported for data type: {}", datatype), } } @@ -1203,7 +1253,7 @@ impl ScalarValue { DataType::Float32 => Ok(ScalarValue::from(consts::PI_UPPER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::PI_UPPER_F64)), _ => { - _internal_err!("PI_UPPER is not supported for data type: {:?}", datatype) + _internal_err!("PI_UPPER is not supported for data type: {}", datatype) } } } @@ -1214,7 +1264,7 @@ impl ScalarValue { DataType::Float32 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F64)), _ => { - _internal_err!("-PI_LOWER is not supported for data type: {:?}", datatype) + _internal_err!("-PI_LOWER is not supported for data type: {}", datatype) } } } @@ -1225,10 +1275,7 @@ impl ScalarValue { DataType::Float32 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F32)), DataType::Float64 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F64)), _ => { - _internal_err!( - "PI_UPPER/2 is not supported for data type: {:?}", - datatype - ) + _internal_err!("PI_UPPER/2 is not supported for data type: {}", datatype) } } } @@ -1243,10 +1290,7 @@ impl ScalarValue { Ok(ScalarValue::from(consts::NEGATIVE_FRAC_PI_2_LOWER_F64)) } _ => { - _internal_err!( - "-PI/2_LOWER is not supported for data type: {:?}", - datatype - ) + _internal_err!("-PI/2_LOWER is not supported for data type: {}", datatype) } } } @@ -1256,7 +1300,7 @@ impl ScalarValue { match datatype { DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::PI)), DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::PI)), - _ => _internal_err!("-PI is 
not supported for data type: {:?}", datatype), + _ => _internal_err!("-PI is not supported for data type: {}", datatype), } } @@ -1265,7 +1309,7 @@ impl ScalarValue { match datatype { DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::FRAC_PI_2)), DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::FRAC_PI_2)), - _ => _internal_err!("PI/2 is not supported for data type: {:?}", datatype), + _ => _internal_err!("PI/2 is not supported for data type: {}", datatype), } } @@ -1274,7 +1318,7 @@ impl ScalarValue { match datatype { DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::FRAC_PI_2)), DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::FRAC_PI_2)), - _ => _internal_err!("-PI/2 is not supported for data type: {:?}", datatype), + _ => _internal_err!("-PI/2 is not supported for data type: {}", datatype), } } @@ -1284,7 +1328,7 @@ impl ScalarValue { DataType::Float32 => Ok(ScalarValue::from(f32::INFINITY)), DataType::Float64 => Ok(ScalarValue::from(f64::INFINITY)), _ => { - _internal_err!("Infinity is not supported for data type: {:?}", datatype) + _internal_err!("Infinity is not supported for data type: {}", datatype) } } } @@ -1296,7 +1340,7 @@ impl ScalarValue { DataType::Float64 => Ok(ScalarValue::from(f64::NEG_INFINITY)), _ => { _internal_err!( - "Negative Infinity is not supported for data type: {:?}", + "Negative Infinity is not supported for data type: {}", datatype ) } @@ -1369,7 +1413,7 @@ impl ScalarValue { DataType::Date64 => ScalarValue::Date64(Some(0)), _ => { return _not_impl_err!( - "Can't create a zero scalar from data_type \"{datatype:?}\"" + "Can't create a zero scalar from data_type \"{datatype}\"" ); } }) @@ -1507,7 +1551,7 @@ impl ScalarValue { // Unsupported types for now _ => { _not_impl_err!( - "Default value for data_type \"{datatype:?}\" is not implemented yet" + "Default value for data_type \"{datatype}\" is not implemented yet" ) } } @@ -1527,6 +1571,34 @@ impl ScalarValue { DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(1.0))), DataType::Float32 => ScalarValue::Float32(Some(1.0)), DataType::Float64 => ScalarValue::Float64(Some(1.0)), + DataType::Decimal32(precision, scale) => { + validate_decimal_precision_and_scale::( + *precision, *scale, + )?; + if *scale < 0 { + return _internal_err!("Negative scale is not supported"); + } + match 10_i32.checked_pow(*scale as u32) { + Some(value) => { + ScalarValue::Decimal32(Some(value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } + DataType::Decimal64(precision, scale) => { + validate_decimal_precision_and_scale::( + *precision, *scale, + )?; + if *scale < 0 { + return _internal_err!("Negative scale is not supported"); + } + match i64::from(10).checked_pow(*scale as u32) { + Some(value) => { + ScalarValue::Decimal64(Some(value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } DataType::Decimal128(precision, scale) => { validate_decimal_precision_and_scale::( *precision, *scale, @@ -1557,7 +1629,7 @@ impl ScalarValue { } _ => { return _not_impl_err!( - "Can't create an one scalar from data_type \"{datatype:?}\"" + "Can't create an one scalar from data_type \"{datatype}\"" ); } }) @@ -1573,6 +1645,34 @@ impl ScalarValue { DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(-1.0))), DataType::Float32 => ScalarValue::Float32(Some(-1.0)), DataType::Float64 => ScalarValue::Float64(Some(-1.0)), + DataType::Decimal32(precision, scale) => { + 
validate_decimal_precision_and_scale::( + *precision, *scale, + )?; + if *scale < 0 { + return _internal_err!("Negative scale is not supported"); + } + match 10_i32.checked_pow(*scale as u32) { + Some(value) => { + ScalarValue::Decimal32(Some(-value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } + DataType::Decimal64(precision, scale) => { + validate_decimal_precision_and_scale::( + *precision, *scale, + )?; + if *scale < 0 { + return _internal_err!("Negative scale is not supported"); + } + match i64::from(10).checked_pow(*scale as u32) { + Some(value) => { + ScalarValue::Decimal64(Some(-value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } DataType::Decimal128(precision, scale) => { validate_decimal_precision_and_scale::( *precision, *scale, @@ -1603,7 +1703,7 @@ impl ScalarValue { } _ => { return _not_impl_err!( - "Can't create a negative one scalar from data_type \"{datatype:?}\"" + "Can't create a negative one scalar from data_type \"{datatype}\"" ); } }) @@ -1622,6 +1722,38 @@ impl ScalarValue { DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(10.0))), DataType::Float32 => ScalarValue::Float32(Some(10.0)), DataType::Float64 => ScalarValue::Float64(Some(10.0)), + DataType::Decimal32(precision, scale) => { + if let Err(err) = validate_decimal_precision_and_scale::( + *precision, *scale, + ) { + return _internal_err!("Invalid precision and scale {err}"); + } + if *scale <= 0 { + return _internal_err!("Negative scale is not supported"); + } + match 10_i32.checked_pow((*scale + 1) as u32) { + Some(value) => { + ScalarValue::Decimal32(Some(value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } + DataType::Decimal64(precision, scale) => { + if let Err(err) = validate_decimal_precision_and_scale::( + *precision, *scale, + ) { + return _internal_err!("Invalid precision and scale {err}"); + } + if *scale <= 0 { + return _internal_err!("Negative scale is not supported"); + } + match i64::from(10).checked_pow((*scale + 1) as u32) { + Some(value) => { + ScalarValue::Decimal64(Some(value), *precision, *scale) + } + None => return _internal_err!("Unsupported scale {scale}"), + } + } DataType::Decimal128(precision, scale) => { if let Err(err) = validate_decimal_precision_and_scale::( *precision, *scale, @@ -1656,7 +1788,7 @@ impl ScalarValue { } _ => { return _not_impl_err!( - "Can't create a ten scalar from data_type \"{datatype:?}\"" + "Can't create a ten scalar from data_type \"{datatype}\"" ); } }) @@ -1674,6 +1806,12 @@ impl ScalarValue { ScalarValue::Int16(_) => DataType::Int16, ScalarValue::Int32(_) => DataType::Int32, ScalarValue::Int64(_) => DataType::Int64, + ScalarValue::Decimal32(_, precision, scale) => { + DataType::Decimal32(*precision, *scale) + } + ScalarValue::Decimal64(_, precision, scale) => { + DataType::Decimal64(*precision, *scale) + } ScalarValue::Decimal128(_, precision, scale) => { DataType::Decimal128(*precision, *scale) } @@ -1796,6 +1934,24 @@ impl ScalarValue { ); Ok(ScalarValue::IntervalMonthDayNano(Some(val))) } + ScalarValue::Decimal32(Some(v), precision, scale) => { + Ok(ScalarValue::Decimal32( + Some(neg_checked_with_ctx(*v, || { + format!("In negation of Decimal32({v}, {precision}, {scale})") + })?), + *precision, + *scale, + )) + } + ScalarValue::Decimal64(Some(v), precision, scale) => { + Ok(ScalarValue::Decimal64( + Some(neg_checked_with_ctx(*v, || { + format!("In negation of Decimal64({v}, {precision}, 
{scale})") + })?), + *precision, + *scale, + )) + } ScalarValue::Decimal128(Some(v), precision, scale) => { Ok(ScalarValue::Decimal128( Some(neg_checked_with_ctx(*v, || { @@ -1947,6 +2103,8 @@ impl ScalarValue { ScalarValue::Float16(v) => v.is_none(), ScalarValue::Float32(v) => v.is_none(), ScalarValue::Float64(v) => v.is_none(), + ScalarValue::Decimal32(v, _, _) => v.is_none(), + ScalarValue::Decimal64(v, _, _) => v.is_none(), ScalarValue::Decimal128(v, _, _) => v.is_none(), ScalarValue::Decimal256(v, _, _) => v.is_none(), ScalarValue::Int8(v) => v.is_none(), @@ -2202,19 +2360,19 @@ impl ScalarValue { } let array: ArrayRef = match &data_type { - DataType::Decimal32(_precision, _scale) => { - return _not_impl_err!( - "Decimal32 not supported in ScalarValue::iter_to_array" - ); + DataType::Decimal32(precision, scale) => { + let decimal_array = + ScalarValue::iter_to_decimal32_array(scalars, *precision, *scale)?; + Arc::new(decimal_array) } - DataType::Decimal64(_precision, _scale) => { - return _not_impl_err!( - "Decimal64 not supported in ScalarValue::iter_to_array" - ); + DataType::Decimal64(precision, scale) => { + let decimal_array = + ScalarValue::iter_to_decimal64_array(scalars, *precision, *scale)?; + Arc::new(decimal_array) } DataType::Decimal128(precision, scale) => { let decimal_array = - ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?; + ScalarValue::iter_to_decimal128_array(scalars, *precision, *scale)?; Arc::new(decimal_array) } DataType::Decimal256(precision, scale) => { @@ -2364,7 +2522,7 @@ impl ScalarValue { DataType::UInt16 => dict_from_values::(values)?, DataType::UInt32 => dict_from_values::(values)?, DataType::UInt64 => dict_from_values::(values)?, - _ => unreachable!("Invalid dictionary keys type: {:?}", key_type), + _ => unreachable!("Invalid dictionary keys type: {}", key_type), } } DataType::FixedSizeBinary(size) => { @@ -2375,7 +2533,7 @@ impl ScalarValue { } else { _exec_err!( "Inconsistent types in ScalarValue::iter_to_array. \ - Expected {data_type:?}, got {sv:?}" + Expected {data_type}, got {sv:?}" ) } }) @@ -2423,7 +2581,43 @@ impl ScalarValue { Ok(new_null_array(&DataType::Null, length)) } - fn iter_to_decimal_array( + fn iter_to_decimal32_array( + scalars: impl IntoIterator, + precision: u8, + scale: i8, + ) -> Result { + let array = scalars + .into_iter() + .map(|element: ScalarValue| match element { + ScalarValue::Decimal32(v1, _, _) => Ok(v1), + s => { + _internal_err!("Expected ScalarValue::Null element. Received {s:?}") + } + }) + .collect::>()? + .with_precision_and_scale(precision, scale)?; + Ok(array) + } + + fn iter_to_decimal64_array( + scalars: impl IntoIterator, + precision: u8, + scale: i8, + ) -> Result { + let array = scalars + .into_iter() + .map(|element: ScalarValue| match element { + ScalarValue::Decimal64(v1, _, _) => Ok(v1), + s => { + _internal_err!("Expected ScalarValue::Null element. Received {s:?}") + } + }) + .collect::>()? 
+ .with_precision_and_scale(precision, scale)?; + Ok(array) + } + + fn iter_to_decimal128_array( scalars: impl IntoIterator, precision: u8, scale: i8, @@ -2461,7 +2655,43 @@ impl ScalarValue { Ok(array) } - fn build_decimal_array( + fn build_decimal32_array( + value: Option, + precision: u8, + scale: i8, + size: usize, + ) -> Result { + Ok(match value { + Some(val) => Decimal32Array::from(vec![val; size]) + .with_precision_and_scale(precision, scale)?, + None => { + let mut builder = Decimal32Array::builder(size) + .with_precision_and_scale(precision, scale)?; + builder.append_nulls(size); + builder.finish() + } + }) + } + + fn build_decimal64_array( + value: Option, + precision: u8, + scale: i8, + size: usize, + ) -> Result { + Ok(match value { + Some(val) => Decimal64Array::from(vec![val; size]) + .with_precision_and_scale(precision, scale)?, + None => { + let mut builder = Decimal64Array::builder(size) + .with_precision_and_scale(precision, scale)?; + builder.append_nulls(size); + builder.finish() + } + }) + } + + fn build_decimal128_array( value: Option, precision: u8, scale: i8, @@ -2640,8 +2870,14 @@ impl ScalarValue { /// - a `Dictionary` that fails be converted to a dictionary array of size pub fn to_array_of_size(&self, size: usize) -> Result { Ok(match self { + ScalarValue::Decimal32(e, precision, scale) => Arc::new( + ScalarValue::build_decimal32_array(*e, *precision, *scale, size)?, + ), + ScalarValue::Decimal64(e, precision, scale) => Arc::new( + ScalarValue::build_decimal64_array(*e, *precision, *scale, size)?, + ), ScalarValue::Decimal128(e, precision, scale) => Arc::new( - ScalarValue::build_decimal_array(*e, *precision, *scale, size)?, + ScalarValue::build_decimal128_array(*e, *precision, *scale, size)?, ), ScalarValue::Decimal256(e, precision, scale) => Arc::new( ScalarValue::build_decimal256_array(*e, *precision, *scale, size)?, @@ -2937,7 +3173,7 @@ impl ScalarValue { DataType::UInt16 => dict_from_scalar::(v, size)?, DataType::UInt32 => dict_from_scalar::(v, size)?, DataType::UInt64 => dict_from_scalar::(v, size)?, - _ => unreachable!("Invalid dictionary keys type: {:?}", key_type), + _ => unreachable!("Invalid dictionary keys type: {}", key_type), } } ScalarValue::Null => get_or_create_cached_null_array(size), @@ -2951,6 +3187,24 @@ impl ScalarValue { scale: i8, ) -> Result { match array.data_type() { + DataType::Decimal32(_, _) => { + let array = as_decimal32_array(array)?; + if array.is_null(index) { + Ok(ScalarValue::Decimal32(None, precision, scale)) + } else { + let value = array.value(index); + Ok(ScalarValue::Decimal32(Some(value), precision, scale)) + } + } + DataType::Decimal64(_, _) => { + let array = as_decimal64_array(array)?; + if array.is_null(index) { + Ok(ScalarValue::Decimal64(None, precision, scale)) + } else { + let value = array.value(index); + Ok(ScalarValue::Decimal64(Some(value), precision, scale)) + } + } DataType::Decimal128(_, _) => { let array = as_decimal128_array(array)?; if array.is_null(index) { @@ -2969,7 +3223,9 @@ impl ScalarValue { Ok(ScalarValue::Decimal256(Some(value), precision, scale)) } } - _ => _internal_err!("Unsupported decimal type"), + other => { + unreachable!("Invalid type isn't decimal: {other:?}") + } } } @@ -3050,17 +3306,30 @@ impl ScalarValue { /// assert_eq!(scalar_vec, expected); /// ``` pub fn convert_array_to_scalar_vec(array: &dyn Array) -> Result>> { - let mut scalars = Vec::with_capacity(array.len()); - - for index in 0..array.len() { - let nested_array = array.as_list::().value(index); - let scalar_values = 
(0..nested_array.len()) - .map(|i| ScalarValue::try_from_array(&nested_array, i)) - .collect::>>()?; - scalars.push(scalar_values); + fn generic_collect( + array: &dyn Array, + ) -> Result>> { + array + .as_list::() + .iter() + .map(|nested_array| match nested_array { + Some(nested_array) => (0..nested_array.len()) + .map(|i| ScalarValue::try_from_array(&nested_array, i)) + .collect::>>(), + // TODO: what can we put for null? + // https://github.com/apache/datafusion/issues/17749 + None => Ok(vec![]), + }) + .collect() } - Ok(scalars) + match array.data_type() { + DataType::List(_) => generic_collect::(array), + DataType::LargeList(_) => generic_collect::(array), + _ => _internal_err!( + "ScalarValue::convert_array_to_scalar_vec input must be a List/LargeList type" + ), + } } #[deprecated( @@ -3083,6 +3352,16 @@ impl ScalarValue { Ok(match array.data_type() { DataType::Null => ScalarValue::Null, + DataType::Decimal32(precision, scale) => { + ScalarValue::get_decimal_value_from_array( + array, index, *precision, *scale, + )? + } + DataType::Decimal64(precision, scale) => { + ScalarValue::get_decimal_value_from_array( + array, index, *precision, *scale, + )? + } DataType::Decimal128(precision, scale) => { ScalarValue::get_decimal_value_from_array( array, index, *precision, *scale, @@ -3197,7 +3476,7 @@ impl ScalarValue { DataType::UInt16 => get_dict_value::(array, index)?, DataType::UInt32 => get_dict_value::(array, index)?, DataType::UInt64 => get_dict_value::(array, index)?, - _ => unreachable!("Invalid dictionary keys type: {:?}", key_type), + _ => unreachable!("Invalid dictionary keys type: {}", key_type), }; // look up the index in the values dictionary let value = match values_index { @@ -3343,6 +3622,44 @@ impl ScalarValue { ScalarValue::try_from_array(&cast_arr, 0) } + fn eq_array_decimal32( + array: &ArrayRef, + index: usize, + value: Option<&i32>, + precision: u8, + scale: i8, + ) -> Result { + let array = as_decimal32_array(array)?; + if array.precision() != precision || array.scale() != scale { + return Ok(false); + } + let is_null = array.is_null(index); + if let Some(v) = value { + Ok(!array.is_null(index) && array.value(index) == *v) + } else { + Ok(is_null) + } + } + + fn eq_array_decimal64( + array: &ArrayRef, + index: usize, + value: Option<&i64>, + precision: u8, + scale: i8, + ) -> Result { + let array = as_decimal64_array(array)?; + if array.precision() != precision || array.scale() != scale { + return Ok(false); + } + let is_null = array.is_null(index); + if let Some(v) = value { + Ok(!array.is_null(index) && array.value(index) == *v) + } else { + Ok(is_null) + } + } + fn eq_array_decimal( array: &ArrayRef, index: usize, @@ -3410,6 +3727,24 @@ impl ScalarValue { #[inline] pub fn eq_array(&self, array: &ArrayRef, index: usize) -> Result { Ok(match self { + ScalarValue::Decimal32(v, precision, scale) => { + ScalarValue::eq_array_decimal32( + array, + index, + v.as_ref(), + *precision, + *scale, + )? + } + ScalarValue::Decimal64(v, precision, scale) => { + ScalarValue::eq_array_decimal64( + array, + index, + v.as_ref(), + *precision, + *scale, + )? 
+ } ScalarValue::Decimal128(v, precision, scale) => { ScalarValue::eq_array_decimal( array, @@ -3571,7 +3906,7 @@ impl ScalarValue { DataType::UInt16 => get_dict_value::(array, index)?, DataType::UInt32 => get_dict_value::(array, index)?, DataType::UInt64 => get_dict_value::(array, index)?, - _ => unreachable!("Invalid dictionary keys type: {:?}", key_type), + _ => unreachable!("Invalid dictionary keys type: {}", key_type), }; // was the value in the array non null? match values_index { @@ -3608,6 +3943,8 @@ impl ScalarValue { | ScalarValue::Float16(_) | ScalarValue::Float32(_) | ScalarValue::Float64(_) + | ScalarValue::Decimal32(_, _, _) + | ScalarValue::Decimal64(_, _, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Decimal256(_, _, _) | ScalarValue::Int8(_) @@ -3717,6 +4054,8 @@ impl ScalarValue { | ScalarValue::Float16(_) | ScalarValue::Float32(_) | ScalarValue::Float64(_) + | ScalarValue::Decimal32(_, _, _) + | ScalarValue::Decimal64(_, _, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Decimal256(_, _, _) | ScalarValue::Int8(_) @@ -4230,6 +4569,12 @@ macro_rules! format_option { impl fmt::Display for ScalarValue { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + ScalarValue::Decimal32(v, p, s) => { + write!(f, "{v:?},{p:?},{s:?}")?; + } + ScalarValue::Decimal64(v, p, s) => { + write!(f, "{v:?},{p:?},{s:?}")?; + } ScalarValue::Decimal128(v, p, s) => { write!(f, "{v:?},{p:?},{s:?}")?; } @@ -4419,6 +4764,8 @@ fn fmt_binary(data: &[u8], f: &mut fmt::Formatter) -> fmt::Result { impl fmt::Debug for ScalarValue { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + ScalarValue::Decimal32(_, _, _) => write!(f, "Decimal32({self})"), + ScalarValue::Decimal64(_, _, _) => write!(f, "Decimal64({self})"), ScalarValue::Decimal128(_, _, _) => write!(f, "Decimal128({self})"), ScalarValue::Decimal256(_, _, _) => write!(f, "Decimal256({self})"), ScalarValue::Boolean(_) => write!(f, "Boolean({self})"), @@ -4614,6 +4961,8 @@ impl ScalarType for Date32Type { #[cfg(test)] mod tests { + use std::sync::Arc; + use super::*; use crate::cast::{as_list_array, as_map_array, as_struct_array}; use crate::test_util::batches_to_string; @@ -4622,7 +4971,7 @@ mod tests { NullArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveBuilder, RecordBatch, StringBuilder, StringDictionaryBuilder, StructBuilder, UnionBuilder, }; - use arrow::buffer::{Buffer, OffsetBuffer}; + use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer}; use arrow::compute::{is_null, kernels}; use arrow::datatypes::{ ArrowNumericType, Fields, Float64Type, DECIMAL256_MAX_PRECISION, @@ -8663,4 +9012,66 @@ mod tests { _ => panic!("Expected TimestampMillisecond with timezone"), } } + + #[test] + fn test_convert_array_to_scalar_vec() { + // Regular ListArray + let list = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(4)]), + ]); + let converted = ScalarValue::convert_array_to_scalar_vec(&list).unwrap(); + assert_eq!( + converted, + vec![ + vec![ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))], + vec![], + vec![ + ScalarValue::Int64(Some(3)), + ScalarValue::Int64(None), + ScalarValue::Int64(Some(4)) + ], + ] + ); + + // Regular LargeListArray + let large_list = LargeListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(4)]), + ]); + let converted = ScalarValue::convert_array_to_scalar_vec(&large_list).unwrap(); + assert_eq!( + converted, + vec![ + vec![ScalarValue::Int64(Some(1)), 
ScalarValue::Int64(Some(2))], + vec![], + vec![ + ScalarValue::Int64(Some(3)), + ScalarValue::Int64(None), + ScalarValue::Int64(Some(4)) + ], + ] + ); + + // Funky (null slot has non-zero list offsets) + // Offsets + Values looks like this: [[1, 2], [3, 4], [5]] + // But with NullBuffer it's like this: [[1, 2], NULL, [5]] + let funky = ListArray::new( + Field::new_list_field(DataType::Int64, true).into(), + OffsetBuffer::new(vec![0, 2, 4, 5].into()), + Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])), + Some(NullBuffer::from(vec![true, false, true])), + ); + let converted = ScalarValue::convert_array_to_scalar_vec(&funky).unwrap(); + assert_eq!( + converted, + vec![ + vec![ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))], + vec![], + vec![ScalarValue::Int64(Some(5))], + ] + ); + } } diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 76629e555b..5cef0adfbd 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -23,6 +23,7 @@ use crate::error::{Result, _internal_err}; use arrow::compute::can_cast_types; use arrow::datatypes::{ DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, + DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; use std::{fmt::Display, sync::Arc}; @@ -185,7 +186,7 @@ pub enum NativeType { impl Display for NativeType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "NativeType::{self:?}") + write!(f, "{self:?}") // TODO: nicer formatting } } @@ -228,7 +229,15 @@ impl LogicalType for NativeType { (Self::Float16, _) => Float16, (Self::Float32, _) => Float32, (Self::Float64, _) => Float64, - (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s), + (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => { + Decimal32(*p, *s) + } + (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => { + Decimal64(*p, *s) + } + (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => { + Decimal128(*p, *s) + } (Self::Decimal(p, s), _) => Decimal256(*p, *s), (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()), // If given type is Date, return the same type @@ -352,10 +361,10 @@ impl LogicalType for NativeType { } _ => { return _internal_err!( - "Unavailable default cast for native type {:?} from physical type {:?}", - self, - origin - ) + "Unavailable default cast for native type {} from physical type {}", + self, + origin + ) } }) } @@ -472,4 +481,9 @@ impl NativeType { pub fn is_duration(&self) -> bool { matches!(self, NativeType::Duration(_)) } + + #[inline] + pub fn is_binary(&self) -> bool { + matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_)) + } } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 620885697b..1e5d1d104d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -149,6 +149,7 @@ parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } rand = { workspace = true } regex = { workspace = true } +rstest = { workspace = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } sqlparser = { workspace = true, optional = true } tempfile = { workspace = true } @@ -177,7 +178,7 @@ rand_distr = "0.5" regex = { workspace = true } rstest = { workspace = true } serde_json = { workspace = true } -sysinfo = "0.37.0" +sysinfo = "0.37.1" test-utils = { path = "../../test-utils" } tokio = { workspace = true, features = 
["rt-multi-thread", "parking_lot", "fs"] } diff --git a/datafusion/core/README.md b/datafusion/core/README.md index b5501087d2..859fcb9c0d 100644 --- a/datafusion/core/README.md +++ b/datafusion/core/README.md @@ -17,15 +17,12 @@ under the License. --> -# DataFusion Core + -DataFusion is an extensible query execution framework, written in Rust, -that uses Apache Arrow as its in-memory format. +# Apache DataFusion Core This crate contains the main entry points and high level DataFusion APIs such as `SessionContext`, `DataFrame` and `ListingTable`. - -For more information, please see: - -- [DataFusion Website](https://datafusion.apache.org) -- [DataFusion API Docs](https://docs.rs/datafusion/latest/datafusion/) diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index c71191507f..3be8668b2b 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -476,7 +476,8 @@ fn criterion_benchmark(c: &mut Criterion) { }); }); - for partitioning_columns in [4, 7, 8] { + // It was observed in production that queries with window functions sometimes partition over more than 30 columns + for partitioning_columns in [4, 7, 8, 12, 30] { c.bench_function( &format!( "physical_window_function_partition_by_{partitioning_columns}_on_values" @@ -663,6 +664,9 @@ fn criterion_benchmark(c: &mut Criterion) { }; let raw_tpcds_sql_queries = (1..100) + // skip query 75 until it is fixed + // https://github.com/apache/datafusion/issues/17801 + .filter(|q| *q != 75) .map(|q| std::fs::read_to_string(format!("{tests_path}tpc-ds/{q}.sql")).unwrap()) .collect::>(); diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 9832c0e9db..02c2c81ad5 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -43,7 +43,7 @@ use crate::physical_plan::{ use crate::prelude::SessionContext; use std::any::Any; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use arrow::array::{Array, ArrayRef, Int64Array, StringArray}; @@ -2023,31 +2023,38 @@ impl DataFrame { pub fn with_column(self, name: &str, expr: Expr) -> Result { let window_func_exprs = find_window_exprs([&expr]); - let (window_fn_str, plan) = if window_func_exprs.is_empty() { - (None, self.plan) + let original_names: HashSet = self + .plan + .schema() + .iter() + .map(|(_, f)| f.name().clone()) + .collect(); + + // Maybe build window plan + let plan = if window_func_exprs.is_empty() { + self.plan } else { - ( - Some(window_func_exprs[0].to_string()), - LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?, - ) + LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)? 
}; - let mut col_exists = false; let new_column = expr.alias(name); + let mut col_exists = false; + let mut fields: Vec<(Expr, bool)> = plan .schema() .iter() .filter_map(|(qualifier, field)| { + // Skip new fields introduced by window_plan + if !original_names.contains(field.name()) { + return None; + } + if field.name() == name { col_exists = true; Some((new_column.clone(), true)) } else { let e = col(Column::from((qualifier, field))); - window_fn_str - .as_ref() - .filter(|s| *s == &e.to_string()) - .is_none() - .then_some((e, self.projection_requires_validation)) + Some((e, self.projection_requires_validation)) } }) .collect(); @@ -2440,8 +2447,7 @@ impl TableProvider for DataFrameTableProvider { } fn schema(&self) -> SchemaRef { - let schema: Schema = self.plan.schema().as_ref().into(); - Arc::new(schema) + Arc::clone(self.plan.schema().inner()) } fn table_type(&self) -> TableType { diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 735c69af77..6fb00df343 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -102,6 +102,7 @@ impl DataFrame { #[cfg(test)] mod tests { + use rstest::rstest; use std::collections::HashMap; use std::sync::Arc; @@ -247,9 +248,12 @@ mod tests { Ok(()) } + #[rstest] #[cfg(feature = "parquet_encryption")] #[tokio::test] - async fn roundtrip_parquet_with_encryption() -> Result<()> { + async fn roundtrip_parquet_with_encryption( + #[values(false, true)] allow_single_file_parallelism: bool, + ) -> Result<()> { use parquet::encryption::decrypt::FileDecryptionProperties; use parquet::encryption::encrypt::FileEncryptionProperties; @@ -278,6 +282,7 @@ mod tests { // Write encrypted parquet using write_parquet let mut options = TableParquetOptions::default(); options.crypto.file_encryption = Some((&encrypt).into()); + options.global.allow_single_file_parallelism = allow_single_file_parallelism; df.write_parquet( tempfile_str.as_str(), diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 6c4897f711..edbbea97a1 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -1233,7 +1233,7 @@ mod tests { fn csv_values(line_number: usize) -> (i32, f64, bool, String) { let int_value = line_number as i32; let float_value = line_number as f64; - let bool_value = line_number % 2 == 0; + let bool_value = line_number.is_multiple_of(2); let char_value = format!("{line_number}-string"); (int_value, float_value, bool_value, char_value) } diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 218a1fedbb..f98297d0e3 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -27,7 +27,7 @@ use crate::datasource::listing::{ }; use crate::execution::context::SessionState; -use arrow::datatypes::{DataType, SchemaRef}; +use arrow::datatypes::DataType; use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, ToDFSchema}; use datafusion_common::{config_datafusion_err, Result}; use datafusion_expr::CreateExternalTable; @@ -105,7 +105,7 @@ impl TableProviderFactory for ListingTableFactory { .collect::>(), ) } else { - let schema: SchemaRef = Arc::new(cmd.schema.as_ref().to_owned().into()); + let schema = Arc::clone(cmd.schema.inner()); let table_partition_cols = cmd .table_partition_cols .iter() @@ 
-238,6 +238,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -278,6 +279,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -322,6 +324,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -373,6 +376,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -416,6 +420,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -455,6 +460,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], @@ -495,6 +501,7 @@ mod tests { schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, + or_replace: false, temporary: false, definition: None, order_exprs: vec![], diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 6f1c6c4171..7c9767ceec 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -2018,14 +2018,14 @@ mod tests { let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out"; fs::create_dir(&out_dir).unwrap(); let df = ctx.sql("SELECT c1, c2 FROM test").await?; - let schema: Schema = df.schema().into(); + let schema = Arc::clone(df.schema().inner()); // Register a listing table - this will use all files in the directory as data sources // for the query ctx.register_listing_table( "my_table", &out_dir, listing_options, - Some(Arc::new(schema)), + Some(schema), None, ) .await diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index e7a66c4f9e..011b84aa51 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -792,19 +792,44 @@ impl SessionContext { return not_impl_err!("Temporary tables not supported"); } - if exist { - match cmd.if_not_exists { - true => return self.return_empty_dataframe(), - false => { - return exec_err!("Table '{}' already exists", cmd.name); + match (cmd.if_not_exists, cmd.or_replace, exist) { + (true, false, true) => self.return_empty_dataframe(), + (false, true, true) => { + let result = self + .find_and_deregister(cmd.name.clone(), TableType::Base) + .await; + + match result { + Ok(true) => { + let table_provider: Arc = + self.create_custom_table(cmd).await?; + self.register_table(cmd.name.clone(), table_provider)?; + self.return_empty_dataframe() + } + Ok(false) => { + let table_provider: Arc = + self.create_custom_table(cmd).await?; + self.register_table(cmd.name.clone(), table_provider)?; + self.return_empty_dataframe() + } + Err(e) => { + exec_err!("Errored while deregistering external table: {}", e) + } } } + (true, true, true) => { + exec_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'") + } + (_, _, false) => { + let table_provider: Arc = + 
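A minimal usage sketch (not part of the patch) of the `(if_not_exists, or_replace, exist)` handling introduced here. The file path and column lists are hypothetical placeholders, and it assumes the SQL front end accepts `OR REPLACE` for external tables, as the new `or_replace` flag suggests.

```rust
// Hypothetical usage of the new OR REPLACE handling for external tables.
// 'data.csv' and the column lists are placeholders, not from the patch.
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // First definition registers the listing table.
    ctx.sql("CREATE EXTERNAL TABLE t (a INT) STORED AS CSV LOCATION 'data.csv'")
        .await?;

    // With OR REPLACE the existing provider is deregistered and re-created
    // instead of failing with "table already exists".
    ctx.sql("CREATE OR REPLACE EXTERNAL TABLE t (a INT, b INT) STORED AS CSV LOCATION 'data.csv'")
        .await?;

    // Combining IF NOT EXISTS with OR REPLACE on an existing table is rejected.
    Ok(())
}
```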
self.create_custom_table(cmd).await?; + self.register_table(cmd.name.clone(), table_provider)?; + self.return_empty_dataframe() + } + (false, false, true) => { + exec_err!("External table '{}' already exists", cmd.name) + } } - - let table_provider: Arc = - self.create_custom_table(cmd).await?; - self.register_table(cmd.name.clone(), table_provider)?; - self.return_empty_dataframe() } async fn create_memory_table(&self, cmd: CreateMemoryTable) -> Result { @@ -830,7 +855,7 @@ impl SessionContext { (true, false, Ok(_)) => self.return_empty_dataframe(), (false, true, Ok(_)) => { self.deregister_table(name.clone())?; - let schema = Arc::new(input.schema().as_ref().into()); + let schema = Arc::clone(input.schema().inner()); let physical = DataFrame::new(self.state(), input); let batches: Vec<_> = physical.collect_partitioned().await?; @@ -848,8 +873,7 @@ impl SessionContext { exec_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'") } (_, _, Err(_)) => { - let df_schema = input.schema(); - let schema = Arc::new(df_schema.as_ref().into()); + let schema = Arc::clone(input.schema().inner()); let physical = DataFrame::new(self.state(), input); let batches: Vec<_> = physical.collect_partitioned().await?; @@ -1730,6 +1754,14 @@ impl FunctionRegistry for SessionContext { ) -> Result<()> { self.state.write().register_expr_planner(expr_planner) } + + fn udafs(&self) -> HashSet { + self.state.read().udafs() + } + + fn udwfs(&self) -> HashSet { + self.state.read().udwfs() + } } /// Create a new task context instance from SessionContext diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f658290904..b04004dd49 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1917,6 +1917,14 @@ impl FunctionRegistry for SessionState { self.expr_planners.push(expr_planner); Ok(()) } + + fn udafs(&self) -> HashSet { + self.aggregate_functions.keys().cloned().collect() + } + + fn udwfs(&self) -> HashSet { + self.window_functions.keys().cloned().collect() + } } impl OptimizerConfig for SessionState { diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 15d325288b..6b4d2592f6 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -59,12 +59,10 @@ use crate::schema_equivalence::schema_satisfied_by; use arrow::array::{builder::StringBuilder, RecordBatch}; use arrow::compute::SortOptions; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::Schema; use datafusion_catalog::ScanArgs; use datafusion_common::display::ToStringifiedPlan; -use datafusion_common::tree_node::{ - Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeVisitor, -}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::TableReference; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, @@ -85,7 +83,7 @@ use datafusion_expr::{ WindowFrameBound, WriteOp, }; use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; -use datafusion_physical_expr::expressions::{Column, Literal}; +use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::{ create_physical_sort_exprs, LexOrdering, PhysicalSortExpr, }; @@ -468,7 +466,6 @@ impl DefaultPhysicalPlanner { Arc::clone(res.plan()) } LogicalPlan::Values(Values { values, schema }) => { - let exec_schema = 
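Not from the patch: a small sketch of how the new `FunctionRegistry::udafs` / `udwfs` accessors can be consumed. It assumes the default aggregate and window function packages are registered, as they are with a stock `SessionContext`.

```rust
// Sketch (not from the patch): enumerating registered aggregate and window
// function names through the new FunctionRegistry accessors.
use datafusion::execution::FunctionRegistry;
use datafusion::prelude::SessionContext;

fn main() {
    let ctx = SessionContext::new();

    // Assumes the default function packages are registered.
    let aggregate_names = ctx.udafs();
    assert!(aggregate_names.contains("sum"));

    let window_names = ctx.udwfs();
    assert!(window_names.contains("row_number"));
}
```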
schema.as_ref().to_owned().into(); let exprs = values .iter() .map(|row| { @@ -479,27 +476,23 @@ impl DefaultPhysicalPlanner { .collect::>>>() }) .collect::>>()?; - MemorySourceConfig::try_new_as_values(SchemaRef::new(exec_schema), exprs)? + MemorySourceConfig::try_new_as_values(Arc::clone(schema.inner()), exprs)? as _ } LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema, - }) => Arc::new(EmptyExec::new(SchemaRef::new( - schema.as_ref().to_owned().into(), - ))), + }) => Arc::new(EmptyExec::new(Arc::clone(schema.inner()))), LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: true, schema, - }) => Arc::new(PlaceholderRowExec::new(SchemaRef::new( - schema.as_ref().to_owned().into(), - ))), + }) => Arc::new(PlaceholderRowExec::new(Arc::clone(schema.inner()))), LogicalPlan::DescribeTable(DescribeTable { schema, output_schema, }) => { - let output_schema: Schema = output_schema.as_ref().into(); - self.plan_describe(Arc::clone(schema), Arc::new(output_schema))? + let output_schema = Arc::clone(output_schema.inner()); + self.plan_describe(Arc::clone(schema), output_schema)? } // 1 Child @@ -516,7 +509,7 @@ impl DefaultPhysicalPlanner { let parsed_url = ListingTableUrl::parse(output_url)?; let object_store_url = parsed_url.object_store(); - let schema: Schema = (**input.schema()).clone().into(); + let schema = Arc::clone(input.schema().inner()); // Note: the DataType passed here is ignored for the purposes of writing and inferred instead // from the schema of the RecordBatch being written. This allows COPY statements to specify only @@ -553,7 +546,7 @@ impl DefaultPhysicalPlanner { object_store_url, table_paths: vec![parsed_url], file_group: FileGroup::default(), - output_schema: Arc::new(schema), + output_schema: schema, table_partition_cols, insert_op: InsertOp::Append, keep_partition_by_columns, @@ -741,9 +734,54 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; - let (aggregates, filters, _order_bys): (Vec<_>, Vec<_>, Vec<_>) = + let (mut aggregates, filters, _order_bys): (Vec<_>, Vec<_>, Vec<_>) = multiunzip(agg_filter); + let mut async_exprs = Vec::new(); + let num_input_columns = physical_input_schema.fields().len(); + + for agg_func in &mut aggregates { + match self.try_plan_async_exprs( + num_input_columns, + PlannedExprResult::Expr(agg_func.expressions()), + physical_input_schema.as_ref(), + )? { + PlanAsyncExpr::Async( + async_map, + PlannedExprResult::Expr(physical_exprs), + ) => { + async_exprs.extend(async_map.async_exprs); + + if let Some(new_agg_func) = agg_func.with_new_expressions( + physical_exprs, + agg_func + .order_bys() + .iter() + .cloned() + .map(|x| x.expr) + .collect(), + ) { + *agg_func = Arc::new(new_agg_func); + } else { + return internal_err!("Failed to plan async expression"); + } + } + PlanAsyncExpr::Sync(PlannedExprResult::Expr(_)) => { + // Do nothing + } + _ => { + return internal_err!( + "Unexpected result from try_plan_async_exprs" + ) + } + } + } + let input_exec = if !async_exprs.is_empty() { + Arc::new(AsyncFuncExec::try_new(async_exprs, input_exec)?) + } else { + input_exec + }; + let initial_aggr = Arc::new(AggregateExec::try_new( AggregateMode::Partial, groups.clone(), @@ -933,7 +971,7 @@ impl DefaultPhysicalPlanner { .. 
}) => { let input = children.one()?; - let schema = SchemaRef::new(schema.as_ref().to_owned().into()); + let schema = Arc::clone(schema.inner()); let list_column_indices = list_type_columns .iter() .map(|(index, unnesting)| ListUnnest { @@ -1641,7 +1679,7 @@ pub fn create_window_expr_with_name( execution_props: &ExecutionProps, ) -> Result> { let name = name.into(); - let physical_schema: &Schema = &logical_schema.into(); + let physical_schema = Arc::clone(logical_schema.inner()); match e { Expr::WindowFunction(window_fun) => { let WindowFunction { @@ -2033,7 +2071,7 @@ impl DefaultPhysicalPlanner { session_state: &SessionState, ) -> Result> { let input = self.create_physical_plan(&a.input, session_state).await?; - let schema = SchemaRef::new((*a.schema).clone().into()); + let schema = Arc::clone(a.schema.inner()); let show_statistics = session_state.config_options().explain.show_statistics; Ok(Arc::new(AnalyzeExec::new( a.verbose, @@ -2097,7 +2135,15 @@ impl DefaultPhysicalPlanner { "Optimized physical plan:\n{}\n", displayable(new_plan.as_ref()).indent(false) ); - debug!("Detailed optimized physical plan:\n{new_plan:?}"); + + // Don't print new_plan directly, as that may overflow the stack. + // For example: + // thread 'tokio-runtime-worker' has overflowed its stack + // fatal runtime error: stack overflow, aborting + debug!( + "Detailed optimized physical plan:\n{}\n", + displayable(new_plan.as_ref()).indent(true) + ); Ok(new_plan) } @@ -2115,7 +2161,7 @@ impl DefaultPhysicalPlanner { // "System supplied type" --> Use debug format of the datatype let data_type = field.data_type(); - data_types.append_value(format!("{data_type:?}")); + data_types.append_value(format!("{data_type}")); // "YES if the column is possibly nullable, NO if it is known not nullable. " let nullable_str = if field.is_nullable() { "YES" } else { "NO" }; @@ -2181,11 +2227,7 @@ impl DefaultPhysicalPlanner { let physical_expr = self.create_physical_expr(e, input_logical_schema, session_state); - // Check for possible column name mismatches - let final_physical_expr = - maybe_fix_physical_column_name(physical_expr, &input_physical_schema); - - tuple_err((final_physical_expr, physical_name)) + tuple_err((physical_expr, physical_name)) }) .collect::>>()?; @@ -2272,11 +2314,13 @@ impl DefaultPhysicalPlanner { } } +#[derive(Debug)] enum PlannedExprResult { ExprWithName(Vec<(Arc, String)>), Expr(Vec>), } +#[derive(Debug)] enum PlanAsyncExpr { Sync(PlannedExprResult), Async(AsyncMapper, PlannedExprResult), @@ -2291,47 +2335,6 @@ fn tuple_err(value: (Result, Result)) -> Result<(T, R)> { } } -// Handle the case where the name of a physical column expression does not match the corresponding physical input fields names. -// Physical column names are derived from the physical schema, whereas physical column expressions are derived from the logical column names. -// -// This is a special case that applies only to column expressions. Logical plans may slightly modify column names by appending a suffix (e.g., using ':'), -// to avoid duplicates—since DFSchemas do not allow duplicate names. For example: `count(Int64(1)):1`. 
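Not part of the patch: a minimal sketch of the schema-conversion pattern applied throughout this file, where `DFSchema::inner()` exposes the underlying Arrow `SchemaRef` so the `Arc` can be cloned instead of rebuilding the whole `Schema` through `as_ref().into()`.

```rust
// Sketch (not from the patch): Arc::clone(df_schema.inner()) is a refcount
// bump, while the previous Arc::new(df_schema.as_ref().into()) rebuilt every
// field of the Arrow schema.
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion_common::{DFSchema, Result};

fn arrow_schema_of(df_schema: &DFSchema) -> SchemaRef {
    Arc::clone(df_schema.inner())
}

fn main() -> Result<()> {
    let arrow_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let df_schema = DFSchema::try_from(Arc::clone(&arrow_schema))?;
    // The two schemas compare equal; no fields were copied to get here.
    assert_eq!(arrow_schema_of(&df_schema), arrow_schema);
    Ok(())
}
```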
-fn maybe_fix_physical_column_name( - expr: Result>, - input_physical_schema: &SchemaRef, -) -> Result> { - let Ok(expr) = expr else { return expr }; - expr.transform_down(|node| { - if let Some(column) = node.as_any().downcast_ref::() { - let idx = column.index(); - let physical_field = input_physical_schema.field(idx); - let expr_col_name = column.name(); - let physical_name = physical_field.name(); - - if expr_col_name != physical_name { - // handle edge cases where the physical_name contains ':'. - let colon_count = physical_name.matches(':').count(); - let mut splits = expr_col_name.match_indices(':'); - let split_pos = splits.nth(colon_count); - - if let Some((i, _)) = split_pos { - let base_name = &expr_col_name[..i]; - if base_name == physical_name { - let updated_column = Column::new(physical_name, idx); - return Ok(Transformed::yes(Arc::new(updated_column))); - } - } - } - - // If names already match or fix is not possible, just leave it as it is - Ok(Transformed::no(node)) - } else { - Ok(Transformed::no(node)) - } - }) - .data() -} - struct OptimizationInvariantChecker<'a> { rule: &'a Arc, } @@ -2429,18 +2432,17 @@ mod tests { use crate::execution::session_state::SessionStateBuilder; use arrow::array::{ArrayRef, DictionaryArray, Int32Array}; use arrow::datatypes::{DataType, Field, Int32Type}; + use arrow_schema::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_common::{ assert_contains, DFSchemaRef, TableReference, ToDFSchema as _, }; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; - use datafusion_expr::{ - col, lit, LogicalPlanBuilder, Operator, UserDefinedLogicalNodeCore, - }; + use datafusion_expr::builder::subquery_alias; + use datafusion_expr::{col, lit, LogicalPlanBuilder, UserDefinedLogicalNodeCore}; use datafusion_functions_aggregate::count::count_all; use datafusion_functions_aggregate::expr_fn::sum; - use datafusion_physical_expr::expressions::{BinaryExpr, IsNotNullExpr}; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; @@ -2742,7 +2744,7 @@ mod tests { assert_contains!( &e, - r#"Error during planning: Can not find compatible types to compare Boolean with [Struct([Field { name: "foo", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), Utf8]"# + r#"Error during planning: Can not find compatible types to compare Boolean with [Struct(foo Boolean), Utf8]"# ); Ok(()) @@ -3001,71 +3003,6 @@ mod tests { } } - #[tokio::test] - async fn test_maybe_fix_colon_in_physical_name() { - // The physical schema has a field name with a colon - let schema = Schema::new(vec![Field::new("metric:avg", DataType::Int32, false)]); - let schema_ref: SchemaRef = Arc::new(schema); - - // What might happen after deduplication - let logical_col_name = "metric:avg:1"; - let expr_with_suffix = - Arc::new(Column::new(logical_col_name, 0)) as Arc; - let expr_result = Ok(expr_with_suffix); - - // Call function under test - let fixed_expr = - maybe_fix_physical_column_name(expr_result, &schema_ref).unwrap(); - - // Downcast back to Column so we can check the name - let col = fixed_expr - .as_any() - .downcast_ref::() - .expect("Column"); - - assert_eq!(col.name(), "metric:avg"); - } - - #[tokio::test] - async fn test_maybe_fix_nested_column_name_with_colon() { - let schema = Schema::new(vec![Field::new("column", DataType::Int32, false)]); - let schema_ref: SchemaRef = Arc::new(schema); - - // Construct the nested expr - 
let col_expr = Arc::new(Column::new("column:1", 0)) as Arc; - let is_not_null_expr = Arc::new(IsNotNullExpr::new(col_expr.clone())); - - // Create a binary expression and put the column inside - let binary_expr = Arc::new(BinaryExpr::new( - is_not_null_expr.clone(), - Operator::Or, - is_not_null_expr.clone(), - )) as Arc; - - let fixed_expr = - maybe_fix_physical_column_name(Ok(binary_expr), &schema_ref).unwrap(); - - let bin = fixed_expr - .as_any() - .downcast_ref::() - .expect("Expected BinaryExpr"); - - // Check that both sides where renamed - for expr in &[bin.left(), bin.right()] { - let is_not_null = expr - .as_any() - .downcast_ref::() - .expect("Expected IsNotNull"); - - let col = is_not_null - .arg() - .as_any() - .downcast_ref::() - .expect("Expected Column"); - - assert_eq!(col.name(), "column"); - } - } struct ErrorExtensionPlanner {} #[async_trait] @@ -3562,4 +3499,61 @@ digraph { Ok(()) } + + // Reproducer for DataFusion issue #17405: + // + // The following SQL is semantically invalid. Notably, the `SELECT left_table.a, right_table.a` + // clause is missing from the explicit logical plan: + // + // SELECT a FROM ( + // -- SELECT left_table.a, right_table.a + // FROM left_table + // FULL JOIN right_table ON left_table.a = right_table.a + // ) AS alias + // GROUP BY a; + // + // As a result, the variables within `alias` subquery are not properly distinguished, which + // leads to a bug for logical and physical planning. + // + // The fix is to implicitly insert a Projection node to represent the missing SELECT clause to + // ensure each field is correctly aliased to a unique name when the SubqueryAlias node is added. + #[tokio::test] + async fn subquery_alias_confusing_the_optimizer() -> Result<()> { + let state = make_session_state(); + + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let schema = Arc::new(schema); + + let table = MemTable::try_new(schema.clone(), vec![vec![]])?; + let table = Arc::new(table); + + let source = DefaultTableSource::new(table); + let source = Arc::new(source); + + let left = LogicalPlanBuilder::scan("left", source.clone(), None)?; + let right = LogicalPlanBuilder::scan("right", source, None)?.build()?; + + let join_keys = ( + vec![datafusion_common::Column::new(Some("left"), "a")], + vec![datafusion_common::Column::new(Some("right"), "a")], + ); + + let join = left.join(right, JoinType::Full, join_keys, None)?.build()?; + + let alias = subquery_alias(join, "alias")?; + + let planner = DefaultPhysicalPlanner::default(); + + let logical_plan = LogicalPlanBuilder::new(alias) + .aggregate(vec![col("a:1")], Vec::::new())? 
+ .build()?; + let _physical_plan = planner.create_physical_plan(&logical_plan, &state).await?; + + let optimized_logical_plan = state.optimize(&logical_plan)?; + let _optimized_physical_plan = planner + .create_physical_plan(&optimized_logical_plan, &state) + .await?; + + Ok(()) + } } diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 299b73ccbe..7149c5b0bd 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -185,7 +185,7 @@ impl TableProviderFactory for TestTableFactory { ) -> Result> { Ok(Arc::new(TestTableProvider { url: cmd.location.to_string(), - schema: Arc::new(cmd.schema.as_ref().into()), + schema: Arc::clone(cmd.schema.inner()), })) } } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index aa984775e4..e9b531723f 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -27,18 +27,19 @@ use arrow::array::{ }; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{ - DataType, Field, Float32Type, Int32Type, Schema, SchemaRef, UInt64Type, UnionFields, - UnionMode, + DataType, Field, Float32Type, Int32Type, Schema, UInt64Type, UnionFields, UnionMode, }; use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; +use arrow_schema::{SortOptions, TimeUnit}; use datafusion::{assert_batches_eq, dataframe}; use datafusion_functions_aggregate::count::{count_all, count_all_window}; use datafusion_functions_aggregate::expr_fn::{ - array_agg, avg, count, count_distinct, max, median, min, sum, + array_agg, avg, avg_distinct, count, count_distinct, max, median, min, sum, + sum_distinct, }; use datafusion_functions_nested::make_array::make_array_udf; -use datafusion_functions_window::expr_fn::{first_value, row_number}; +use datafusion_functions_window::expr_fn::{first_value, lead, row_number}; use insta::assert_snapshot; use object_store::local::LocalFileSystem; use std::collections::HashMap; @@ -63,8 +64,8 @@ use datafusion::test_util::{ use datafusion_catalog::TableProvider; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; use datafusion_common::{ - assert_contains, Constraint, Constraints, DataFusionError, ParamValues, ScalarValue, - TableReference, UnnestOptions, + assert_contains, Constraint, Constraints, DFSchema, DataFusionError, ParamValues, + ScalarValue, TableReference, UnnestOptions, }; use datafusion_common_runtime::SpawnedTask; use datafusion_datasource::file_format::format_as_file_type; @@ -80,10 +81,19 @@ use datafusion_expr::{ LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; +use datafusion_physical_expr::aggregate::AggregateExprBuilder; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::Partitioning; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; -use datafusion_physical_plan::{displayable, ExecutionPlanProperties}; +use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; +use datafusion_physical_plan::aggregates::{ + AggregateExec, AggregateMode, PhysicalGroupBy, +}; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties}; + +use datafusion::error::Result as DataFusionResult; +use datafusion_functions_window::expr_fn::lag; // Get string representation of the plan async fn physical_plan_to_string(df: &DataFrame) -> String { @@ -119,8 
+129,7 @@ pub fn table_with_constraints() -> Arc { } async fn assert_logical_expr_schema_eq_physical_expr_schema(df: DataFrame) -> Result<()> { - let logical_expr_dfschema = df.schema(); - let logical_expr_schema = SchemaRef::from(logical_expr_dfschema.to_owned()); + let logical_expr_schema = Arc::clone(df.schema().inner()); let batches = df.collect().await?; let physical_expr_schema = batches[0].schema(); assert_eq!(logical_expr_schema, physical_expr_schema); @@ -152,6 +161,46 @@ async fn test_array_agg_ord_schema() -> Result<()> { Ok(()) } +type WindowFnCase = (fn() -> Expr, &'static str); + +#[tokio::test] +async fn with_column_window_functions() -> DataFusionResult<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + + let ctx = SessionContext::new(); + + let provider = MemTable::try_new(Arc::new(schema), vec![vec![batch]])?; + ctx.register_table("t", Arc::new(provider))?; + + // Define test cases: (expr builder, alias name) + let test_cases: Vec = vec![ + (|| lag(col("a"), Some(1), None), "lag_val"), + (|| lead(col("a"), Some(1), None), "lead_val"), + (row_number, "row_num"), + ]; + + for (make_expr, alias) in test_cases { + let df = ctx.table("t").await?; + let expr = make_expr(); + let df_with = df.with_column(alias, expr)?; + let df_schema = df_with.schema().clone(); + + assert!( + df_schema.has_column_with_unqualified_name(alias), + "Schema does not contain expected column {alias}", + ); + + assert_eq!(2, df_schema.columns().len()); + } + + Ok(()) +} + #[tokio::test] async fn test_coalesce_schema() -> Result<()> { let ctx = SessionContext::new(); @@ -497,32 +546,35 @@ async fn drop_with_periods() -> Result<()> { #[tokio::test] async fn aggregate() -> Result<()> { // build plan using DataFrame API - let df = test_table().await?; + // union so some of the distincts have a clearly distinct result + let df = test_table().await?.union(test_table().await?)?; let group_expr = vec![col("c1")]; let aggr_expr = vec![ - min(col("c12")), - max(col("c12")), - avg(col("c12")), - sum(col("c12")), - count(col("c12")), - count_distinct(col("c12")), + min(col("c4")).alias("min(c4)"), + max(col("c4")).alias("max(c4)"), + avg(col("c4")).alias("avg(c4)"), + avg_distinct(col("c4")).alias("avg_distinct(c4)"), + sum(col("c4")).alias("sum(c4)"), + sum_distinct(col("c4")).alias("sum_distinct(c4)"), + count(col("c4")).alias("count(c4)"), + count_distinct(col("c4")).alias("count_distinct(c4)"), ]; let df: Vec = df.aggregate(group_expr, aggr_expr)?.collect().await?; assert_snapshot!( batches_to_sort_string(&df), - @r###" - +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ - | c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) | - +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ - | a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 | - | b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 | - | c | 0.0494924465469434 | 
0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 | - | d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 | - | e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 | - +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ - "### + @r" + +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+ + | c1 | min(c4) | max(c4) | avg(c4) | avg_distinct(c4) | sum(c4) | sum_distinct(c4) | count(c4) | count_distinct(c4) | + +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+ + | a | -28462 | 32064 | 306.04761904761904 | 306.04761904761904 | 12854 | 6427 | 42 | 21 | + | b | -28070 | 25286 | 7732.315789473684 | 7732.315789473684 | 293828 | 146914 | 38 | 19 | + | c | -30508 | 29106 | -1320.5238095238096 | -1320.5238095238096 | -55462 | -27731 | 42 | 21 | + | d | -24558 | 31106 | 10890.111111111111 | 10890.111111111111 | 392044 | 196022 | 36 | 18 | + | e | -31500 | 32514 | -4268.333333333333 | -4268.333333333333 | -179270 | -89635 | 42 | 21 | + +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+ + " ); Ok(()) @@ -537,7 +589,9 @@ async fn aggregate_assert_no_empty_batches() -> Result<()> { min(col("c12")), max(col("c12")), avg(col("c12")), + avg_distinct(col("c12")), sum(col("c12")), + sum_distinct(col("c12")), count(col("c12")), count_distinct(col("c12")), median(col("c12")), @@ -613,12 +667,12 @@ async fn test_aggregate_with_pk2() -> Result<()> { let df = df.filter(predicate)?; assert_snapshot!( physical_plan_to_string(&df).await, - @r###" - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 AND name@1 = a - AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + @r" + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=Sorted + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 AND name@1 = a DataSourceExec: partitions=1, partition_sizes=[1] - "### + " ); // Since id and name are functionally dependant, we can use name among expression @@ -662,12 +716,12 @@ async fn test_aggregate_with_pk3() -> Result<()> { let df = df.select(vec![col("id"), col("name")])?; assert_snapshot!( physical_plan_to_string(&df).await, - @r###" - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 - AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + @r" + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=PartiallySorted([0]) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 DataSourceExec: partitions=1, partition_sizes=[1] - "### + " ); // Since id and name are functionally dependant, we can use name among expression @@ -713,12 +767,12 @@ async fn test_aggregate_with_pk4() -> Result<()> { // columns are not used. 
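Not from the patch: a compact sketch showing that the DISTINCT helpers exercised in the updated aggregate test above are ordinary expression builders, so they can be mixed freely with their non-distinct counterparts in `DataFrame::aggregate`. The column names are hypothetical.

```rust
// Sketch (not from the patch): mixing distinct and non-distinct aggregates.
use datafusion::error::Result;
use datafusion::prelude::{col, DataFrame};
use datafusion_functions_aggregate::expr_fn::{avg, avg_distinct, sum, sum_distinct};

fn distinct_aggregates(df: DataFrame) -> Result<DataFrame> {
    df.aggregate(
        vec![col("c1")],
        vec![
            avg(col("c4")).alias("avg"),
            avg_distinct(col("c4")).alias("avg_distinct"),
            sum(col("c4")).alias("sum"),
            sum_distinct(col("c4")).alias("sum_distinct"),
        ],
    )
}
```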
assert_snapshot!( physical_plan_to_string(&df).await, - @r###" - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: id@0 = 1 - AggregateExec: mode=Single, gby=[id@0 as id], aggr=[] + @r" + AggregateExec: mode=Single, gby=[id@0 as id], aggr=[], ordering_mode=Sorted + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 DataSourceExec: partitions=1, partition_sizes=[1] - "### + " ); let df_results = df.collect().await?; @@ -6323,3 +6377,105 @@ async fn test_copy_to_preserves_order() -> Result<()> { ); Ok(()) } + +#[tokio::test] +async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> { + let ctx = SessionContext::new(); + + // Simple schema with just the fields we need + let file_schema = Arc::new(Schema::new(vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ), + Field::new("ticker", DataType::Utf8, true), + Field::new("value", DataType::Float64, true), + Field::new("date", DataType::Utf8, false), + ])); + + let df_schema = DFSchema::try_from(file_schema.clone())?; + + let timestamp = col("timestamp"); + let value = col("value"); + let ticker = col("ticker"); + let date = col("date"); + + let mock_exec = Arc::new(EmptyExec::new(file_schema.clone())); + + // Build first_value aggregate + let first_value = Arc::new( + AggregateExprBuilder::new( + datafusion_functions_aggregate::first_last::first_value_udaf(), + vec![ctx.create_physical_expr(value.clone(), &df_schema)?], + ) + .alias("first_value(value)") + .order_by(vec![PhysicalSortExpr::new( + ctx.create_physical_expr(timestamp.clone(), &df_schema)?, + SortOptions::new(false, false), + )]) + .schema(file_schema.clone()) + .build() + .expect("Failed to build first_value"), + ); + + // Build last_value aggregate + let last_value = Arc::new( + AggregateExprBuilder::new( + datafusion_functions_aggregate::first_last::last_value_udaf(), + vec![ctx.create_physical_expr(value.clone(), &df_schema)?], + ) + .alias("last_value(value)") + .order_by(vec![PhysicalSortExpr::new( + ctx.create_physical_expr(timestamp.clone(), &df_schema)?, + SortOptions::new(false, false), + )]) + .schema(file_schema.clone()) + .build() + .expect("Failed to build last_value"), + ); + + let partial_agg = AggregateExec::try_new( + AggregateMode::Partial, + PhysicalGroupBy::new_single(vec![ + ( + ctx.create_physical_expr(date.clone(), &df_schema)?, + "date".to_string(), + ), + ( + ctx.create_physical_expr(ticker.clone(), &df_schema)?, + "ticker".to_string(), + ), + ]), + vec![first_value, last_value], + vec![None, None], + mock_exec, + file_schema, + ) + .expect("Failed to build partial agg"); + + // Assert that the schema field names match the expected names + let expected_field_names = vec![ + "date", + "ticker", + "first_value(value)[first_value]", + "timestamp@0", + "is_set", + "last_value(value)[last_value]", + "timestamp@0", + "is_set", + ]; + + let binding = partial_agg.schema(); + let actual_field_names: Vec<_> = binding.fields().iter().map(|f| f.name()).collect(); + assert_eq!(actual_field_names, expected_field_names); + + // Ensure that DFSchema::try_from does not fail + let partial_agg_exec_schema = DFSchema::try_from(partial_agg.schema()); + assert!( + partial_agg_exec_schema.is_ok(), + "Expected get AggregateExec schema to succeed with duplicate state fields" + ); + + Ok(()) +} diff --git a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs index e7f63b5351..45dba5f786 100644 --- 
a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs +++ b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs @@ -20,18 +20,19 @@ use std::sync::Arc; use arrow::array::{ArrayRef, DictionaryArray, PrimitiveArray, RecordBatch}; use arrow::datatypes::{ ArrowPrimitiveType, BooleanType, DataType, Date32Type, Date64Type, Decimal128Type, - Decimal256Type, DurationMicrosecondType, DurationMillisecondType, - DurationNanosecondType, DurationSecondType, Field, Float32Type, Float64Type, - Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, - IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, Schema, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, + Decimal256Type, Decimal32Type, Decimal64Type, DurationMicrosecondType, + DurationMillisecondType, DurationNanosecondType, DurationSecondType, Field, + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, + Schema, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, + Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, + DECIMAL256_MAX_SCALE, DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, + DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE, }; use datafusion_common::{arrow_datafusion_err, DataFusionError, Result}; use rand::{rng, rngs::StdRng, Rng, SeedableRng}; @@ -104,6 +105,20 @@ pub fn get_supported_types_columns(rng_seed: u64) -> Vec { "duration_nanosecond", DataType::Duration(TimeUnit::Nanosecond), ), + ColumnDescr::new("decimal32", { + let precision: u8 = rng.random_range(1..=DECIMAL32_MAX_PRECISION); + let scale: i8 = rng.random_range( + i8::MIN..=std::cmp::min(precision as i8, DECIMAL32_MAX_SCALE), + ); + DataType::Decimal32(precision, scale) + }), + ColumnDescr::new("decimal64", { + let precision: u8 = rng.random_range(1..=DECIMAL64_MAX_PRECISION); + let scale: i8 = rng.random_range( + i8::MIN..=std::cmp::min(precision as i8, DECIMAL64_MAX_SCALE), + ); + DataType::Decimal64(precision, scale) + }), ColumnDescr::new("decimal128", { let precision: u8 = rng.random_range(1..=DECIMAL128_MAX_PRECISION); let scale: i8 = rng.random_range( @@ -682,6 +697,32 @@ impl RecordBatchGenerator { _ => unreachable!(), } } + DataType::Decimal32(precision, scale) => { + generate_decimal_array!( + self, + num_rows, + max_num_distinct, + null_pct, + batch_gen_rng, + array_gen_rng, + precision, + scale, + Decimal32Type + ) + } + DataType::Decimal64(precision, scale) => { + generate_decimal_array!( + self, + num_rows, + max_num_distinct, + null_pct, + batch_gen_rng, + array_gen_rng, + precision, + scale, + Decimal64Type + ) + } DataType::Decimal128(precision, scale) => { generate_decimal_array!( self, diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index b659d868ce..7f994daeaa 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -232,10 +232,10 @@ impl RunQueryResult { } } -/// Iterate over each line in the plan and check that one of them has `DataSourceExec` and `DynamicFilterPhysicalExpr` in the same line. 
+/// Iterate over each line in the plan and check that one of them has `DataSourceExec` and `DynamicFilter` in the same line. fn has_dynamic_filter_expr_pushdown(plan: &str) -> bool { for line in plan.lines() { - if line.contains("DataSourceExec") && line.contains("DynamicFilterPhysicalExpr") { + if line.contains("DataSourceExec") && line.contains("DynamicFilter") { return true; } } diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 9da92f7cd4..65a41d39d3 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -286,7 +286,7 @@ async fn bounded_window_causal_non_causal() -> Result<()> { &partitionby_exprs, &[], Arc::new(window_frame), - &extended_schema, + extended_schema, false, false, None, @@ -660,7 +660,7 @@ async fn run_window_test( &partitionby_exprs, &orderby_exprs.clone(), Arc::new(window_frame.clone()), - &extended_schema, + Arc::clone(&extended_schema), false, false, None, @@ -680,7 +680,7 @@ async fn run_window_test( &partitionby_exprs, &orderby_exprs, Arc::new(window_frame.clone()), - &extended_schema, + extended_schema, false, false, None, diff --git a/datafusion/core/tests/macro_hygiene/mod.rs b/datafusion/core/tests/macro_hygiene/mod.rs index 09fb38b72e..c9f33f6fdf 100644 --- a/datafusion/core/tests/macro_hygiene/mod.rs +++ b/datafusion/core/tests/macro_hygiene/mod.rs @@ -73,6 +73,7 @@ mod config_field { #[test] fn test_macro() { #[derive(Debug)] + #[allow(dead_code)] struct E; impl std::fmt::Display for E { diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs index a8002cf400..14c8fc4c41 100644 --- a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs +++ b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs @@ -98,11 +98,9 @@ fn init_once() { fn spawn_test_process(test: &str) { init_once(); - let test_path = format!( - "memory_limit::memory_limit_validation::sort_mem_validation::{}", - test - ); - info!("Running test: {}", test_path); + let test_path = + format!("memory_limit::memory_limit_validation::sort_mem_validation::{test}"); + info!("Running test: {test_path}"); // Run the test command let output = Command::new("cargo") diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index b8d570916c..b769fec7d3 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -26,8 +26,6 @@ //! select * from data limit 10; //! 
``` -use std::path::Path; - use arrow::compute::concat_batches; use arrow::record_batch::RecordBatch; use datafusion::physical_plan::collect; @@ -37,7 +35,10 @@ use datafusion::prelude::{ }; use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile}; use datafusion_expr::utils::{conjunction, disjunction, split_conjunction}; +use std::path::Path; +use datafusion_common::test_util::parquet_test_data; +use datafusion_execution::config::SessionConfig; use itertools::Itertools; use parquet::file::properties::WriterProperties; use tempfile::TempDir; @@ -601,3 +602,99 @@ fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { } } } + +#[tokio::test] +async fn predicate_cache_default() -> datafusion_common::Result<()> { + let ctx = SessionContext::new(); + // The cache is on by default, but not used unless filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 0, + expected_records: 0, + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + let ctx = SessionContext::new_with_config(config); + // The cache is on by default, and used when filter pushdown is enabled + PredicateCacheTest { + expected_inner_records: 8, + expected_records: 4, + } + .run(&ctx) + .await +} + +#[tokio::test] +async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { + // Can disable the cache even with filter pushdown by setting the size to 0. In this case we + // expect the inner records are reported but no records are read from the cache + let mut config = SessionConfig::new(); + config.options_mut().execution.parquet.pushdown_filters = true; + config + .options_mut() + .execution + .parquet + .max_predicate_cache_size = Some(0); + let ctx = SessionContext::new_with_config(config); + PredicateCacheTest { + // file has 8 rows, which need to be read twice, one for filter, one for + // final output + expected_inner_records: 16, + // Expect this to 0 records read as the cache is disabled. However, it is + // non zero due to https://github.com/apache/arrow-rs/issues/8307 + expected_records: 3, + } + .run(&ctx) + .await +} + +/// Runs the query "SELECT * FROM alltypes_plain WHERE double_col != 0.0" +/// with a given SessionContext and asserts that the predicate cache metrics +/// are as expected +#[derive(Debug)] +struct PredicateCacheTest { + /// Expected records read from the underlying reader (to evaluate filters) + /// -- this is the total number of records in the file + expected_inner_records: usize, + /// Expected records to be read from the cache (after filtering) + expected_records: usize, +} + +impl PredicateCacheTest { + async fn run(self, ctx: &SessionContext) -> datafusion_common::Result<()> { + let Self { + expected_inner_records, + expected_records, + } = self; + // Create a dataframe that scans the "alltypes_plain.parquet" file with + // a filter on `double_col != 0.0` + let path = parquet_test_data() + "/alltypes_plain.parquet"; + let exec = ctx + .read_parquet(path, ParquetReadOptions::default()) + .await? + .filter(col("double_col").not_eq(lit(0.0)))? 
+ .create_physical_plan() + .await?; + + // run the plan to completion + let _ = collect(exec.clone(), ctx.task_ctx()).await?; // run plan + let metrics = + TestParquetFile::parquet_metrics(&exec).expect("found parquet metrics"); + + // verify the predicate cache metrics + assert_eq!( + get_value(&metrics, "predicate_cache_inner_records"), + expected_inner_records + ); + assert_eq!( + get_value(&metrics, "predicate_cache_records"), + expected_records + ); + Ok(()) + } +} diff --git a/datafusion/core/tests/parquet/schema_adapter.rs b/datafusion/core/tests/parquet/schema_adapter.rs index f685ccdc9f..a25ed7131e 100644 --- a/datafusion/core/tests/parquet/schema_adapter.rs +++ b/datafusion/core/tests/parquet/schema_adapter.rs @@ -119,7 +119,7 @@ impl SchemaMapper for CustomSchemaMapper { let default_value = match field.data_type() { DataType::Int64 => ScalarValue::Int64(Some(0)), DataType::Utf8 => ScalarValue::Utf8(Some("a".to_string())), - _ => unimplemented!("Unsupported data type: {:?}", field.data_type()), + _ => unimplemented!("Unsupported data type: {}", field.data_type()), }; output_columns .push(default_value.to_array_of_size(batch.num_rows()).unwrap()); @@ -199,7 +199,7 @@ impl PhysicalExprAdapter for CustomPhysicalExprAdapter { DataType::Int64 => ScalarValue::Int64(Some(1)), DataType::Utf8 => ScalarValue::Utf8(Some("b".to_string())), _ => unimplemented!( - "Unsupported data type: {:?}", + "Unsupported data type: {}", field.data_type() ), }; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index a19dd7ace9..3858e70eaf 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -3683,7 +3683,7 @@ async fn test_window_partial_constant_and_set_monotonicity() -> Result<()> { &partition_by, &[], case.window_frame, - input_schema.as_ref(), + Arc::clone(&input_schema), false, false, None, diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index 9f588519ec..41ddfaa100 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -248,7 +248,7 @@ async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() { - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] AND DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ] " ); @@ -271,7 +271,7 @@ async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() { - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, 
pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ d@0 >= aa AND d@0 <= ab ] AND DynamicFilterPhysicalExpr [ e@1 IS NULL OR e@1 < bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ] " ); } @@ -708,7 +708,7 @@ async fn test_topk_dynamic_filter_pushdown() { output: Ok: - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] " ); @@ -734,7 +734,7 @@ async fn test_topk_dynamic_filter_pushdown() { format!("{}", format_plan_for_test(&plan)), @r" - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false], filter=[b@1 > bd] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@1 > bd ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 > bd ] " ); } @@ -791,7 +791,7 @@ async fn test_topk_dynamic_filter_pushdown_multi_column_sort() { output: Ok: - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] " ); @@ -827,7 +827,7 @@ async fn test_topk_dynamic_filter_pushdown_multi_column_sort() { format!("{}", format_plan_for_test(&plan)), @r" - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false], filter=[b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ] " ); // There should be no more batches @@ -911,7 +911,7 @@ async fn test_hashjoin_dynamic_filter_pushdown() { Ok: - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] ", ); @@ -945,7 +945,7 @@ async fn test_hashjoin_dynamic_filter_pushdown() { @r" - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], 
file_type=test, pushdown_supported=true - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] " ); } @@ -1139,7 +1139,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] " ); @@ -1176,7 +1176,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb OR a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb OR a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ] " ); @@ -1193,7 +1193,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] " ); @@ -1336,7 +1336,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] " ); @@ -1370,7 +1370,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() { - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, 
c], file_type=test, pushdown_supported=true - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1 - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ] " ); @@ -1507,8 +1507,8 @@ async fn test_nested_hashjoin_dynamic_filter_pushdown() { - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] ", ); @@ -1538,8 +1538,8 @@ async fn test_nested_hashjoin_dynamic_filter_pushdown() { - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)] - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@0 >= aa AND b@0 <= ab ] - - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ d@0 >= ca AND d@0 <= cb ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb ] " ); } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 0a75d9f52e..c51a5e02c9 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -1281,7 +1281,7 @@ fn test_hash_join_after_projection() -> Result<()> { &JoinType::Inner, None, PartitionMode::Auto, - NullEquality::NullEqualsNull, + NullEquality::NullEqualsNothing, )?); let projection: Arc = Arc::new(ProjectionExec::try_new( vec![ diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 7c9fb9de53..b906dfa4b9 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -236,7 +236,7 @@ pub fn hash_join_exec( join_type, None, 
PartitionMode::Partitioned, - NullEquality::NullEqualsNull, + NullEquality::NullEqualsNothing, )?)) } @@ -263,7 +263,7 @@ pub fn bounded_window_exec_with_partition( partition_by, &sort_exprs, Arc::new(WindowFrame::new(Some(false))), - schema.as_ref(), + schema, false, false, None, diff --git a/datafusion/core/tests/sql/sql_api.rs b/datafusion/core/tests/sql/sql_api.rs index ec086bcc50..b87afd27dd 100644 --- a/datafusion/core/tests/sql/sql_api.rs +++ b/datafusion/core/tests/sql/sql_api.rs @@ -84,8 +84,8 @@ async fn dml_output_schema() { ctx.sql("CREATE TABLE test (x int)").await.unwrap(); let sql = "INSERT INTO test VALUES (1)"; let df = ctx.sql(sql).await.unwrap(); - let count_schema = Schema::new(vec![Field::new("count", DataType::UInt64, false)]); - assert_eq!(Schema::from(df.schema()), count_schema); + let count_schema = &Schema::new(vec![Field::new("count", DataType::UInt64, false)]); + assert_eq!(df.schema().as_arrow(), count_schema); } #[tokio::test] diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index a5b073b147..db70caf525 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -379,13 +379,13 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; - insta::assert_snapshot!(batches_to_string(&alias_result), @r###" - +------------+ - | dummy(t.i) | - +------------+ - | 1.0 | - +------------+ - "###); + insta::assert_snapshot!(batches_to_string(&alias_result), @r" + +------------------+ + | dummy_alias(t.i) | + +------------------+ + | 1.0 | + +------------------+ + "); Ok(()) } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index e1c7a875e0..1c155853b2 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -478,13 +478,13 @@ async fn test_user_defined_functions_with_alias() -> Result<()> { "###); let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?; - insta::assert_snapshot!(batches_to_string(&alias_result), @r###" - +------------+ - | dummy(t.i) | - +------------+ - | 1 | - +------------+ - "###); + insta::assert_snapshot!(batches_to_string(&alias_result), @r" + +------------------+ + | dummy_alias(t.i) | + +------------------+ + | 1 | + +------------------+ + "); Ok(()) } @@ -1833,6 +1833,87 @@ async fn test_config_options_work_for_scalar_func() -> Result<()> { Ok(()) } +/// https://github.com/apache/datafusion/issues/17425 +#[tokio::test] +async fn test_extension_metadata_preserve_in_sql_values() -> Result<()> { + #[derive(Debug, Hash, PartialEq, Eq)] + struct MakeExtension { + signature: Signature, + } + + impl Default for MakeExtension { + fn default() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } + } + + impl ScalarUDFImpl for MakeExtension { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_extension" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + Ok(arg_types.to_vec()) + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + unreachable!("This shouldn't have been called") + } + + fn 
return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(args.arg_fields[0] + .as_ref() + .clone() + .with_metadata(HashMap::from([( + "ARROW:extension:metadata".to_string(), + "foofy.foofy".to_string(), + )])) + .into()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(args.args[0].clone()) + } + } + + let ctx = SessionContext::new(); + ctx.register_udf(MakeExtension::default().into()); + + let batches = ctx + .sql( + " +SELECT extension FROM (VALUES + ('one', make_extension('foofy one')), + ('two', make_extension('foofy two')), + ('three', make_extension('foofy three'))) +AS t(string, extension) + ", + ) + .await? + .collect() + .await?; + + assert_eq!( + batches[0] + .schema() + .field(0) + .metadata() + .get("ARROW:extension:metadata"), + Some(&"foofy.foofy".into()) + ); + Ok(()) +} + /// https://github.com/apache/datafusion/issues/17422 #[tokio::test] async fn test_extension_metadata_preserve_in_subquery() -> Result<()> { diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs index b3542f4da8..555b57fbe6 100644 --- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs @@ -145,22 +145,22 @@ async fn test_udwf_with_alias() { .await .unwrap(); - insta::assert_snapshot!(batches_to_string(&actual), @r###" - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - | 1 | a | 0 | 1 | - | 1 | b | 1 | 1 | - | 1 | c | 2 | 1 | - | 2 | d | 3 | 2 | - | 2 | e | 4 | 2 | - | 2 | f | 5 | 2 | - | 2 | g | 6 | 2 | - | 2 | h | 6 | 2 | - | 2 | i | 6 | 2 | - | 2 | j | 6 | 2 | - +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+ - "###); + insta::assert_snapshot!(batches_to_string(&actual), @r" + +---+---+-----+--------------------------+ + | x | y | val | odd_counter_alias(t.val) | + +---+---+-----+--------------------------+ + | 1 | a | 0 | 1 | + | 1 | b | 1 | 1 | + | 1 | c | 2 | 1 | + | 2 | d | 3 | 2 | + | 2 | e | 4 | 2 | + | 2 | f | 5 | 2 | + | 2 | g | 6 | 2 | + | 2 | h | 6 | 2 | + | 2 | i | 6 | 2 | + | 2 | j | 6 | 2 | + +---+---+-----+--------------------------+ + "); } /// Basic user defined window function with bounded window diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index ab04c68513..e013e8a3d0 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-datasource-avro" description = "datafusion-datasource-avro" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/datasource-avro/README.md b/datafusion/datasource-avro/README.md index 3436d4a85a..e9b8affe60 100644 --- a/datafusion/datasource-avro/README.md +++ b/datafusion/datasource-avro/README.md @@ -17,15 +17,17 @@ under the License. 
--> -# DataFusion datasource +# Apache DataFusion Avro DataSource -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. -This crate is a submodule of DataFusion that defines a Avro based file source. +This crate is a submodule of DataFusion that defines an [Apache Avro] based file source. Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[apache avro]: https://avro.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs index 598484d215..a80f18cf81 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs @@ -153,7 +153,7 @@ impl AvroArrowArrayReader<'_, R> { .map(|value| match value { Ok(Value::Record(v)) => Ok(v), Err(e) => Err(ArrowError::ParseError(format!( - "Failed to parse avro value: {e:?}" + "Failed to parse avro value: {e}" ))), other => Err(ArrowError::ParseError(format!( "Row needs to be of type object, got: {other:?}" @@ -281,7 +281,7 @@ impl AvroArrowArrayReader<'_, R> { self.list_array_string_array_builder::(&dtype, col_name, rows) } ref e => Err(SchemaError(format!( - "Data type is currently not supported for dictionaries in list : {e:?}" + "Data type is currently not supported for dictionaries in list : {e}" ))), } } @@ -308,7 +308,7 @@ impl AvroArrowArrayReader<'_, R> { } e => { return Err(SchemaError(format!( - "Nested list data builder type is not supported: {e:?}" + "Nested list data builder type is not supported: {e}" ))) } }; @@ -373,7 +373,7 @@ impl AvroArrowArrayReader<'_, R> { } e => { return Err(SchemaError(format!( - "Nested list data builder type is not supported: {e:?}" + "Nested list data builder type is not supported: {e}" ))) } } @@ -610,7 +610,7 @@ impl AvroArrowArrayReader<'_, R> { } datatype => { return Err(SchemaError(format!( - "Nested list of {datatype:?} not supported" + "Nested list of {datatype} not supported" ))); } }; @@ -831,7 +831,7 @@ impl AvroArrowArrayReader<'_, R> { } _ => { return Err(SchemaError(format!( - "type {:?} not supported", + "type {} not supported", field.data_type() ))) } @@ -936,7 +936,7 @@ fn resolve_string(v: &Value) -> ArrowResult> { Value::Null => Ok(None), other => Err(AvroError::new(AvroErrorDetails::GetString(other.clone()))), } - .map_err(|e| SchemaError(format!("expected resolvable string : {e:?}"))) + .map_err(|e| SchemaError(format!("expected resolvable string : {e}"))) } fn resolve_u8(v: &Value) -> Option { diff --git a/datafusion/datasource-csv/Cargo.toml b/datafusion/datasource-csv/Cargo.toml index 978cfb5efe..209cea4038 100644 --- a/datafusion/datasource-csv/Cargo.toml +++ b/datafusion/datasource-csv/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-datasource-csv" description = "datafusion-datasource-csv" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true 
repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/datasource-csv/README.md b/datafusion/datasource-csv/README.md index 0ebddb5386..8bdadd0fe2 100644 --- a/datafusion/datasource-csv/README.md +++ b/datafusion/datasource-csv/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion datasource +# Apache DataFusion CSV DataSource -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that defines a CSV based file source. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/datasource-json/Cargo.toml b/datafusion/datasource-json/Cargo.toml index bc4a624c74..987ab60c70 100644 --- a/datafusion/datasource-json/Cargo.toml +++ b/datafusion/datasource-json/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-datasource-json" description = "datafusion-datasource-json" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/datasource-json/README.md b/datafusion/datasource-json/README.md index ac0b73b78e..ca2771b9d6 100644 --- a/datafusion/datasource-json/README.md +++ b/datafusion/datasource-json/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion datasource +# Apache DataFusion JSON DataSource -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that defines a JSON based file source. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
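The error-string changes in this patch (for example `{e:?}` becoming `{e}` in the Avro reader, and `{:?}` becoming `{}` for `DataType` later on) switch messages from Debug to Display formatting, so users see the human-readable rendering instead of the Rust debug representation. A minimal stand-alone sketch of the difference, using a made-up error type rather than the actual Avro error:

    use std::fmt;

    #[derive(Debug)]
    struct ParseError {
        message: String,
    }

    impl fmt::Display for ParseError {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            // Display renders only the human-readable message
            write!(f, "{}", self.message)
        }
    }

    fn main() {
        let e = ParseError { message: "unexpected end of block".to_string() };
        // Debug ({:?}) exposes the struct layout:
        //   Failed to parse avro value: ParseError { message: "unexpected end of block" }
        println!("Failed to parse avro value: {e:?}");
        // Display ({}) is what the updated format strings now emit:
        //   Failed to parse avro value: unexpected end of block
        println!("Failed to parse avro value: {e}");
    }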
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml index 690995a631..1f866ffd6c 100644 --- a/datafusion/datasource-parquet/Cargo.toml +++ b/datafusion/datasource-parquet/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-datasource-parquet" description = "datafusion-datasource-parquet" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/datasource-parquet/README.md b/datafusion/datasource-parquet/README.md index 9ac472a9f4..833fc74a25 100644 --- a/datafusion/datasource-parquet/README.md +++ b/datafusion/datasource-parquet/README.md @@ -17,15 +17,17 @@ under the License. --> -# DataFusion datasource +# Apache DataFusion Parquet DataSource -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. -This crate is a submodule of DataFusion that defines a Parquet based file source. +This crate is a submodule of DataFusion that defines an [Apache Parquet] based file source. Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[apache parquet]: https://parquet.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 1fcc172101..963c1d7795 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -71,11 +71,11 @@ use object_store::buffered::BufWriter; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use parquet::arrow::arrow_writer::{ - compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter, - ArrowLeafColumn, ArrowWriterOptions, + compute_leaves, ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn, + ArrowRowGroupWriterFactory, ArrowWriterOptions, }; use parquet::arrow::async_reader::MetadataFetch; -use parquet::arrow::{ArrowSchemaConverter, AsyncArrowWriter}; +use parquet::arrow::{ArrowWriter, AsyncArrowWriter}; use parquet::basic::Type; use crate::metadata::DFParquetMetadata; @@ -1128,14 +1128,7 @@ impl ParquetSink { runtime: &Arc, path: &Path, ) -> Result { - let schema = if self.parquet_options.global.allow_single_file_parallelism { - // If parallelizing writes, we may be also be doing hive style partitioning - // into multiple files which impacts the schema per file. 
- // Refer to `get_writer_schema()` - &get_writer_schema(&self.config) - } else { - self.config.output_schema() - }; + let schema = self.config.output_schema(); // TODO: avoid this clone in follow up PR, where the writer properties & schema // are calculated once on `ParquetSink::new` @@ -1249,16 +1242,6 @@ impl FileSink for ParquetSink { object_store: Arc, ) -> Result { let parquet_opts = &self.parquet_options; - let mut allow_single_file_parallelism = - parquet_opts.global.allow_single_file_parallelism; - - if parquet_opts.crypto.file_encryption.is_some() - || parquet_opts.crypto.factory_id.is_some() - { - // For now, arrow-rs does not support parallel writes with encryption - // See https://github.com/apache/arrow-rs/issues/7359 - allow_single_file_parallelism = false; - } let mut file_write_tasks: JoinSet< std::result::Result<(Path, FileMetaData), DataFusionError>, @@ -1276,7 +1259,7 @@ impl FileSink for ParquetSink { while let Some((path, mut rx)) = file_stream_rx.recv().await { let parquet_props = self.create_writer_props(&runtime, &path).await?; - if !allow_single_file_parallelism { + if !parquet_opts.global.allow_single_file_parallelism { let mut writer = self .create_async_arrow_writer( &path, @@ -1316,6 +1299,7 @@ impl FileSink for ParquetSink { .build()?; let schema = get_writer_schema(&self.config); let props = parquet_props.clone(); + let skip_arrow_metadata = self.parquet_options.global.skip_arrow_metadata; let parallel_options_clone = parallel_options.clone(); let pool = Arc::clone(context.memory_pool()); file_write_tasks.spawn(async move { @@ -1324,6 +1308,7 @@ impl FileSink for ParquetSink { rx, schema, &props, + skip_arrow_metadata, parallel_options_clone, pool, ) @@ -1404,13 +1389,10 @@ type ColSender = Sender; /// Returns join handles for each columns serialization task along with a send channel /// to send arrow arrays to each serialization task. fn spawn_column_parallel_row_group_writer( - schema: Arc, - parquet_props: Arc, + col_writers: Vec, max_buffer_size: usize, pool: &Arc, ) -> Result<(Vec, Vec)> { - let schema_desc = ArrowSchemaConverter::new().convert(&schema)?; - let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; let num_columns = col_writers.len(); let mut col_writer_tasks = Vec::with_capacity(num_columns); @@ -1505,6 +1487,7 @@ fn spawn_rg_join_and_finalize_task( /// across both columns and row_groups, with a theoretical max number of parallel tasks /// given by n_columns * num_row_groups. 
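The doc comment above describes the fan-out: one serialization task per column within each row group, coordinated by the factory that `ArrowWriter::into_serialized_writer` now hands back. For orientation, a rough single-threaded sketch of that low-level flow, under the simplifying assumptions of a flat schema and an in-memory buffer; the functions below additionally stream `ArrowLeafColumn`s to per-column tasks over channels and track memory reservations:

    use std::sync::Arc;

    use arrow::array::{Int64Array, RecordBatch};
    use arrow::datatypes::{DataType, Field, Schema};
    use parquet::arrow::arrow_writer::{compute_leaves, ArrowWriterOptions};
    use parquet::arrow::ArrowWriter;

    fn write_one_row_group(buffer: &mut Vec<u8>) -> Result<(), Box<dyn std::error::Error>> {
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![Arc::new(Int64Array::from(vec![1, 2, 3]))],
        )?;

        // Split the high-level ArrowWriter into a low-level SerializedFileWriter plus a
        // factory that hands out per-column writers for each row group.
        let writer = ArrowWriter::try_new_with_options(
            buffer,
            Arc::clone(&schema),
            ArrowWriterOptions::new(),
        )?;
        let (mut file_writer, row_group_factory) = writer.into_serialized_writer()?;

        // One ArrowColumnWriter per leaf column of row group 0.
        let mut col_writers = row_group_factory.create_column_writers(0)?;

        // Encode each column independently; this is the part the real code runs on
        // separate tasks so columns are compressed in parallel.
        for ((field, array), col_writer) in schema
            .fields()
            .iter()
            .zip(batch.columns())
            .zip(col_writers.iter_mut())
        {
            for leaf in compute_leaves(field, array)? {
                col_writer.write(&leaf)?;
            }
        }

        // Stitch the finished column chunks back into the file in column order.
        let mut row_group = file_writer.next_row_group()?;
        for col_writer in col_writers {
            col_writer.close()?.append_to_row_group(&mut row_group)?;
        }
        row_group.close()?;
        file_writer.close()?;
        Ok(())
    }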
fn spawn_parquet_parallel_serialization_task( + row_group_writer_factory: ArrowRowGroupWriterFactory, mut data: Receiver, serialize_tx: Sender>, schema: Arc, @@ -1515,13 +1498,11 @@ fn spawn_parquet_parallel_serialization_task( SpawnedTask::spawn(async move { let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream; let max_row_group_rows = writer_props.max_row_group_size(); + let mut row_group_index = 0; + let col_writers = + row_group_writer_factory.create_column_writers(row_group_index)?; let (mut column_writer_handles, mut col_array_channels) = - spawn_column_parallel_row_group_writer( - Arc::clone(&schema), - Arc::clone(&writer_props), - max_buffer_rb, - &pool, - )?; + spawn_column_parallel_row_group_writer(col_writers, max_buffer_rb, &pool)?; let mut current_rg_rows = 0; while let Some(mut rb) = data.recv().await { @@ -1567,10 +1548,12 @@ fn spawn_parquet_parallel_serialization_task( current_rg_rows = 0; rb = rb.slice(rows_left, rb.num_rows() - rows_left); + row_group_index += 1; + let col_writers = row_group_writer_factory + .create_column_writers(row_group_index)?; (column_writer_handles, col_array_channels) = spawn_column_parallel_row_group_writer( - Arc::clone(&schema), - Arc::clone(&writer_props), + col_writers, max_buffer_rb, &pool, )?; @@ -1601,29 +1584,21 @@ fn spawn_parquet_parallel_serialization_task( /// Consume RowGroups serialized by other parallel tasks and concatenate them in /// to the final parquet file, while flushing finalized bytes to an [ObjectStore] async fn concatenate_parallel_row_groups( + mut parquet_writer: SerializedFileWriter, + merged_buff: SharedBuffer, mut serialize_rx: Receiver>, - schema: Arc, - writer_props: Arc, mut object_store_writer: Box, pool: Arc, ) -> Result { - let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES); - let mut file_reservation = MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); - let schema_desc = ArrowSchemaConverter::new().convert(schema.as_ref())?; - let mut parquet_writer = SerializedFileWriter::new( - merged_buff.clone(), - schema_desc.root_schema_ptr(), - writer_props, - )?; - while let Some(task) = serialize_rx.recv().await { let result = task.join_unwind().await; - let mut rg_out = parquet_writer.next_row_group()?; let (serialized_columns, mut rg_reservation, _cnt) = result.map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??; + + let mut rg_out = parquet_writer.next_row_group()?; for chunk in serialized_columns { chunk.append_to_row_group(&mut rg_out)?; rg_reservation.free(); @@ -1661,6 +1636,7 @@ async fn output_single_parquet_file_parallelized( data: Receiver, output_schema: Arc, parquet_props: &WriterProperties, + skip_arrow_metadata: bool, parallel_options: ParallelParquetWriterOptions, pool: Arc, ) -> Result { @@ -1670,7 +1646,19 @@ async fn output_single_parquet_file_parallelized( mpsc::channel::>(max_rowgroups); let arc_props = Arc::new(parquet_props.clone()); + let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES); + let options = ArrowWriterOptions::new() + .with_properties(parquet_props.clone()) + .with_skip_arrow_metadata(skip_arrow_metadata); + let writer = ArrowWriter::try_new_with_options( + merged_buff.clone(), + Arc::clone(&output_schema), + options, + )?; + let (writer, row_group_writer_factory) = writer.into_serialized_writer()?; + let launch_serialization_task = spawn_parquet_parallel_serialization_task( + row_group_writer_factory, data, serialize_tx, Arc::clone(&output_schema), @@ -1679,9 +1667,9 @@ async fn 
output_single_parquet_file_parallelized( Arc::clone(&pool), ); let file_metadata = concatenate_parallel_row_groups( + writer, + merged_buff, serialize_rx, - Arc::clone(&output_schema), - Arc::clone(&arc_props), object_store_writer, pool, ) diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs index 81d5511d69..4de68793ce 100644 --- a/datafusion/datasource-parquet/src/metadata.rs +++ b/datafusion/datasource-parquet/src/metadata.rs @@ -39,7 +39,9 @@ use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::arrow::parquet_to_arrow_schema; -use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; +use parquet::file::metadata::{ + PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, +}; use std::any::Any; use std::collections::HashMap; use std::sync::Arc; @@ -148,7 +150,7 @@ impl<'a> DFParquetMetadata<'a> { if cache_metadata && file_metadata_cache.is_some() { // Need to retrieve the entire metadata for the caching to be effective. - reader = reader.with_page_indexes(true); + reader = reader.with_page_index_policy(PageIndexPolicy::Optional); } let metadata = Arc::new( diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 574fe2a040..d75a979d4c 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -72,6 +72,13 @@ pub struct ParquetFileMetrics { pub page_index_eval_time: Time, /// Total time spent reading and parsing metadata from the footer pub metadata_load_time: Time, + /// Predicate Cache: number of records read directly from the inner reader. + /// This is the number of rows decoded while evaluating predicates + pub predicate_cache_inner_records: Count, + /// Predicate Cache: number of records read from the cache. This is the + /// number of rows that were stored in the cache after evaluating predicates + /// reused for the output. 
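On the metadata side, the boolean `with_page_indexes(true)` toggle is replaced by the `PageIndexPolicy` enum. A small sketch of reading a footer with the new policy, using a synchronous reader and a hypothetical file path purely for illustration (the code in this patch goes through the async `MetadataFetch` path instead):

    use std::fs::File;

    use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader};

    fn read_footer(path: &str) -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open(path)?;
        // `Optional` loads the column/offset indexes when the file has them, but does
        // not error out for files written without a page index.
        let metadata = ParquetMetaDataReader::new()
            .with_page_index_policy(PageIndexPolicy::Optional)
            .parse_and_finish(&file)?;
        println!("row groups: {}", metadata.num_row_groups());
        Ok(())
    }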
+ pub predicate_cache_records: Count, } impl ParquetFileMetrics { @@ -140,6 +147,14 @@ impl ParquetFileMetrics { let files_ranges_pruned_statistics = MetricBuilder::new(metrics) .counter("files_ranges_pruned_statistics", partition); + let predicate_cache_inner_records = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("predicate_cache_inner_records", partition); + + let predicate_cache_records = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("predicate_cache_records", partition); + Self { files_ranges_pruned_statistics, predicate_evaluation_errors, @@ -157,6 +172,8 @@ impl ParquetFileMetrics { bloom_filter_eval_time, page_index_eval_time, metadata_load_time, + predicate_cache_inner_records, + predicate_cache_records, } } } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index bd9406588c..aed7293831 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -51,10 +51,11 @@ use datafusion_execution::parquet_encryption::EncryptionFactory; use futures::{ready, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; use log::debug; +use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::ParquetMetaDataReader; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader}; /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { @@ -106,6 +107,9 @@ pub(super) struct ParquetOpener { #[cfg(feature = "parquet_encryption")] pub encryption_factory: Option<(Arc, EncryptionFactoryOptions)>, + /// Maximum size of the predicate cache, in bytes. If none, uses + /// the arrow-rs default. 
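To show where the new knob below comes from: the limit flows from `TableParquetOptions` through `ParquetSource` into each `ParquetOpener`. A small usage sketch; the field and the `max_predicate_cache_size()` getter are the ones added in this patch, while the `ParquetSource::new(TableParquetOptions)` constructor and the import paths are assumptions based on the existing API:

    use datafusion_common::config::TableParquetOptions;
    use datafusion_datasource_parquet::ParquetSource;

    fn main() {
        // Cap the arrow-rs predicate cache at 8 MiB (the option is in bytes);
        // leaving it as None keeps the arrow-rs default.
        let mut options = TableParquetOptions::default();
        options.global.max_predicate_cache_size = Some(8 * 1024 * 1024);

        // ParquetSource forwards the value to every ParquetOpener it creates.
        let source = ParquetSource::new(options);
        assert_eq!(source.max_predicate_cache_size(), Some(8 * 1024 * 1024));
    }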
+ pub max_predicate_cache_size: Option, } impl FileOpener for ParquetOpener { @@ -154,6 +158,7 @@ impl FileOpener for ParquetOpener { let enable_page_index = self.enable_page_index; #[cfg(feature = "parquet_encryption")] let encryption_context = self.get_encryption_context(); + let max_predicate_cache_size = self.max_predicate_cache_size; Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] @@ -404,21 +409,42 @@ impl FileOpener for ParquetOpener { builder = builder.with_limit(limit) } + if let Some(max_predicate_cache_size) = max_predicate_cache_size { + builder = builder.with_max_predicate_cache_size(max_predicate_cache_size); + } + + // metrics from the arrow reader itself + let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + let stream = builder .with_projection(mask) .with_batch_size(batch_size) .with_row_groups(row_group_indexes) + .with_metrics(arrow_reader_metrics.clone()) .build()?; - let stream = stream - .map_err(DataFusionError::from) - .map(move |b| b.and_then(|b| schema_mapping.map_batch(b))); + let files_ranges_pruned_statistics = + file_metrics.files_ranges_pruned_statistics.clone(); + let predicate_cache_inner_records = + file_metrics.predicate_cache_inner_records.clone(); + let predicate_cache_records = file_metrics.predicate_cache_records.clone(); + + let stream = stream.map_err(DataFusionError::from).map(move |b| { + b.and_then(|b| { + copy_arrow_reader_metrics( + &arrow_reader_metrics, + &predicate_cache_inner_records, + &predicate_cache_records, + ); + schema_mapping.map_batch(b) + }) + }); if let Some(file_pruner) = file_pruner { Ok(EarlyStoppingStream::new( stream, file_pruner, - file_metrics.files_ranges_pruned_statistics.clone(), + files_ranges_pruned_statistics, ) .boxed()) } else { @@ -428,6 +454,22 @@ impl FileOpener for ParquetOpener { } } +/// Copies metrics from ArrowReaderMetrics (the metrics collected by the +/// arrow-rs parquet reader) to the parquet file metrics for DataFusion +fn copy_arrow_reader_metrics( + arrow_reader_metrics: &ArrowReaderMetrics, + predicate_cache_inner_records: &Count, + predicate_cache_records: &Count, +) { + if let Some(v) = arrow_reader_metrics.records_read_from_inner() { + predicate_cache_inner_records.add(v); + } + + if let Some(v) = arrow_reader_metrics.records_read_from_cache() { + predicate_cache_records.add(v); + } +} + /// Wraps an inner RecordBatchStream and a [`FilePruner`] /// /// This can terminate the scan early when some dynamic filters is updated after @@ -658,8 +700,8 @@ async fn load_page_index( if missing_column_index || missing_offset_index { let m = Arc::try_unwrap(Arc::clone(parquet_metadata)) .unwrap_or_else(|e| e.as_ref().clone()); - let mut reader = - ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); + let mut reader = ParquetMetaDataReader::new_with_metadata(m) + .with_page_index_policy(PageIndexPolicy::Optional); reader.load_page_index(input).await?; let new_parquet_metadata = reader.finish()?; let new_arrow_reader = @@ -830,6 +872,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -919,6 +962,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1024,6 +1068,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] 
encryption_factory: None, + max_predicate_cache_size: None, } }; let make_meta = || FileMeta { @@ -1139,6 +1184,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1255,6 +1301,7 @@ mod test { expr_adapter_factory: Some(Arc::new(DefaultPhysicalExprAdapterFactory)), #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, } }; @@ -1438,6 +1485,7 @@ mod test { expr_adapter_factory: None, #[cfg(feature = "parquet_encryption")] encryption_factory: None, + max_predicate_cache_size: None, }; let predicate = logical2physical(&col("a").eq(lit(1u64)), &table_schema); diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs index 166e6d8b0b..660b32f486 100644 --- a/datafusion/datasource-parquet/src/row_filter.rs +++ b/datafusion/datasource-parquet/src/row_filter.rs @@ -76,7 +76,7 @@ use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor} use datafusion_common::Result; use datafusion_datasource::schema_adapter::{SchemaAdapterFactory, SchemaMapper}; use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::utils::reassign_predicate_columns; +use datafusion_physical_expr::utils::reassign_expr_columns; use datafusion_physical_expr::{split_conjunction, PhysicalExpr}; use datafusion_physical_plan::metrics; @@ -119,9 +119,8 @@ impl DatafusionArrowPredicate { rows_matched: metrics::Count, time: metrics::Time, ) -> Result { - let projected_schema = Arc::clone(&candidate.filter_schema); let physical_expr = - reassign_predicate_columns(candidate.expr, &projected_schema, true)?; + reassign_expr_columns(candidate.expr, &candidate.filter_schema)?; Ok(Self { physical_expr, diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 644cea85ca..f32725aa66 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -427,6 +427,12 @@ impl ParquetSource { self.table_parquet_options.global.bloom_filter_on_read } + /// Return the maximum predicate cache size, in bytes, used when + /// `pushdown_filters` + pub fn max_predicate_cache_size(&self) -> Option { + self.table_parquet_options.global.max_predicate_cache_size + } + /// Applies schema adapter factory from the FileScanConfig if present. /// /// # Arguments @@ -583,6 +589,7 @@ impl FileSource for ParquetSource { expr_adapter_factory, #[cfg(feature = "parquet_encryption")] encryption_factory: self.get_encryption_factory_with_config(), + max_predicate_cache_size: self.max_predicate_cache_size(), }) } diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 8df722a28a..3f207d4e65 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-datasource" description = "datafusion-datasource" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/datasource/README.md b/datafusion/datasource/README.md index 5d743bc830..cf0bb7547c 100644 --- a/datafusion/datasource/README.md +++ b/datafusion/datasource/README.md @@ -17,9 +17,9 @@ under the License. 
--> -# DataFusion datasource +# Apache DataFusion DataSource -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that defines common DataSource related components like FileScanConfig, FileCompression etc. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 4e2235eae8..d1940402ee 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -48,11 +48,14 @@ use datafusion_common::{ use datafusion_execution::{ object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, }; -use datafusion_physical_expr::{expressions::Column, utils::reassign_predicate_columns}; -use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::BinaryExpr; +use datafusion_physical_expr::{expressions::Column, utils::reassign_expr_columns}; +use datafusion_physical_expr::{split_conjunction, EquivalenceProperties, Partitioning}; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_plan::filter_pushdown::FilterPushdownPropagation; use datafusion_physical_plan::projection::ProjectionExpr; use datafusion_physical_plan::{ display::{display_orderings, ProjectSchemaDisplay}, @@ -60,9 +63,6 @@ use datafusion_physical_plan::{ projection::{all_alias_free_columns, new_projections_for_columns}, DisplayAs, DisplayFormatType, }; -use datafusion_physical_plan::{ - filter::collect_columns_from_predicate, filter_pushdown::FilterPushdownPropagation, -}; use datafusion_physical_plan::coop::cooperative; use datafusion_physical_plan::execution_plan::SchedulingType; @@ -588,27 +588,14 @@ impl DataSource for FileScanConfig { if let Some(filter) = self.file_source.filter() { // We need to remap column indexes to match the projected schema since that's what the equivalence properties deal with. // Note that this will *ignore* any non-projected columns: these don't factor into ordering / equivalence. 
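The equivalence-info rework just below reduces to: split the pushed-down filter on AND, keep only binary `=` conjuncts whose columns still exist in the projected schema, and hand the two sides to the equivalence properties. A stand-alone sketch of that extraction, with a made-up schema and predicate (`a = b AND c > 5`) rather than anything from this patch:

    use std::sync::Arc;

    use arrow::datatypes::{DataType, Field, Schema};
    use datafusion_common::Result;
    use datafusion_expr::Operator;
    use datafusion_physical_expr::expressions::{col, lit, BinaryExpr};
    use datafusion_physical_expr::{split_conjunction, PhysicalExpr};

    fn main() -> Result<()> {
        let schema = Schema::new(vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Int32, false),
            Field::new("c", DataType::Int32, false),
        ]);

        // Build the physical predicate `a = b AND c > 5`
        let eq: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
            col("a", &schema)?,
            Operator::Eq,
            col("b", &schema)?,
        ));
        let gt: Arc<dyn PhysicalExpr> =
            Arc::new(BinaryExpr::new(col("c", &schema)?, Operator::Gt, lit(5)));
        let filter: Arc<dyn PhysicalExpr> =
            Arc::new(BinaryExpr::new(eq, Operator::And, gt));

        // Walk each AND-ed conjunct and keep only `<expr> = <expr>` pairs
        // (the column remapping via reassign_expr_columns is omitted here).
        let pairs: Vec<_> = split_conjunction(&filter)
            .into_iter()
            .filter_map(|e| match e.as_any().downcast_ref::<BinaryExpr>() {
                Some(b) if b.op() == &Operator::Eq => {
                    Some((Arc::clone(b.left()), Arc::clone(b.right())))
                }
                _ => None,
            })
            .collect();

        assert_eq!(pairs.len(), 1); // only `a = b` produces an equivalence pair
        Ok(())
    }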
- match reassign_predicate_columns(filter, &schema, true) { - Ok(filter) => { - match Self::add_filter_equivalence_info( - filter, - &mut eq_properties, - &schema, - ) { - Ok(()) => {} - Err(e) => { - warn!("Failed to add filter equivalence info: {e}"); - #[cfg(debug_assertions)] - panic!("Failed to add filter equivalence info: {e}"); - } - } - } + match Self::add_filter_equivalence_info(filter, &mut eq_properties, &schema) { + Ok(()) => {} Err(e) => { - warn!("Failed to reassign predicate columns: {e}"); + warn!("Failed to add filter equivalence info: {e}"); #[cfg(debug_assertions)] - panic!("Failed to reassign predicate columns: {e}"); + panic!("Failed to add filter equivalence info: {e}"); } - }; + } } eq_properties } @@ -764,24 +751,24 @@ impl FileScanConfig { eq_properties: &mut EquivalenceProperties, schema: &Schema, ) -> Result<()> { - macro_rules! ignore_dangling_col { - ($col:expr) => { - if let Some(col) = $col.as_any().downcast_ref::() { - if schema.index_of(col.name()).is_err() { - continue; + // Gather valid equality pairs from the filter expression + let equal_pairs = split_conjunction(&filter).into_iter().filter_map(|expr| { + // Ignore any binary expressions that reference non-existent columns in the current schema + // (e.g. due to unnecessary projections being removed) + reassign_expr_columns(Arc::clone(expr), schema) + .ok() + .and_then(|expr| match expr.as_any().downcast_ref::() { + Some(expr) if expr.op() == &Operator::Eq => { + Some((Arc::clone(expr.left()), Arc::clone(expr.right()))) } - } - }; - } + _ => None, + }) + }); - let (equal_pairs, _) = collect_columns_from_predicate(&filter); for (lhs, rhs) in equal_pairs { - // Ignore any binary expressions that reference non-existent columns in the current schema - // (e.g. due to unnecessary projections being removed) - ignore_dangling_col!(lhs); - ignore_dangling_col!(rhs); - eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))? + eq_properties.add_equal_conditions(lhs, rhs)? } + Ok(()) } diff --git a/datafusion/datasource/src/schema_adapter.rs b/datafusion/datasource/src/schema_adapter.rs index bd5833bb78..4c7b37113d 100644 --- a/datafusion/datasource/src/schema_adapter.rs +++ b/datafusion/datasource/src/schema_adapter.rs @@ -267,7 +267,7 @@ pub(crate) fn can_cast_field( Ok(true) } else { plan_err!( - "Cannot cast file schema field {} of type {:?} to table schema field of type {:?}", + "Cannot cast file schema field {} of type {} to table schema field of type {}", file_field.name(), file_field.data_type(), table_field.data_type() diff --git a/datafusion/doc/Cargo.toml b/datafusion/doc/Cargo.toml index fa316348a6..b8324565a0 100644 --- a/datafusion/doc/Cargo.toml +++ b/datafusion/doc/Cargo.toml @@ -19,6 +19,7 @@ name = "datafusion-doc" description = "Documentation module for DataFusion query engine" keywords = ["datafusion", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } diff --git a/datafusion/doc/README.md b/datafusion/doc/README.md index c81a8e78c6..f137a273e3 100644 --- a/datafusion/doc/README.md +++ b/datafusion/doc/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Execution +# Apache DataFusion Documentation -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. 
This crate is a submodule of DataFusion that provides structures and macros for documenting user defined functions. @@ -28,5 +28,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/doc/src/lib.rs b/datafusion/doc/src/lib.rs index 9a2c5656ba..943be7a71c 100644 --- a/datafusion/doc/src/lib.rs +++ b/datafusion/doc/src/lib.rs @@ -21,6 +21,14 @@ )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] +mod udaf; +mod udf; +mod udwf; + +pub use udaf::aggregate_doc_sections; +pub use udf::scalar_doc_sections; +pub use udwf::window_doc_sections; + #[allow(rustdoc::broken_intra_doc_links)] /// Documentation for use by [`ScalarUDFImpl`](ScalarUDFImpl), /// [`AggregateUDFImpl`](AggregateUDFImpl) and [`WindowUDFImpl`](WindowUDFImpl) functions. diff --git a/datafusion/doc/src/udaf.rs b/datafusion/doc/src/udaf.rs new file mode 100644 index 0000000000..c3a0b4adbc --- /dev/null +++ b/datafusion/doc/src/udaf.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Aggregate UDF doc sections for use in public documentation +pub mod aggregate_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_GENERAL, + DOC_SECTION_STATISTICAL, + DOC_SECTION_APPROXIMATE, + ] + } + + pub const DOC_SECTION_GENERAL: DocSection = DocSection { + include: true, + label: "General Functions", + description: None, + }; + + pub const DOC_SECTION_STATISTICAL: DocSection = DocSection { + include: true, + label: "Statistical Functions", + description: None, + }; + + pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection { + include: true, + label: "Approximate Functions", + description: None, + }; +} diff --git a/datafusion/doc/src/udf.rs b/datafusion/doc/src/udf.rs new file mode 100644 index 0000000000..3d18c9ac27 --- /dev/null +++ b/datafusion/doc/src/udf.rs @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Scalar UDF doc sections for use in public documentation +pub mod scalar_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_MATH, + DOC_SECTION_CONDITIONAL, + DOC_SECTION_STRING, + DOC_SECTION_BINARY_STRING, + DOC_SECTION_REGEX, + DOC_SECTION_DATETIME, + DOC_SECTION_ARRAY, + DOC_SECTION_STRUCT, + DOC_SECTION_MAP, + DOC_SECTION_HASHING, + DOC_SECTION_UNION, + DOC_SECTION_OTHER, + ] + } + + pub const fn doc_sections_const() -> &'static [DocSection] { + &[ + DOC_SECTION_MATH, + DOC_SECTION_CONDITIONAL, + DOC_SECTION_STRING, + DOC_SECTION_BINARY_STRING, + DOC_SECTION_REGEX, + DOC_SECTION_DATETIME, + DOC_SECTION_ARRAY, + DOC_SECTION_STRUCT, + DOC_SECTION_MAP, + DOC_SECTION_HASHING, + DOC_SECTION_UNION, + DOC_SECTION_OTHER, + ] + } + + pub const DOC_SECTION_MATH: DocSection = DocSection { + include: true, + label: "Math Functions", + description: None, + }; + + pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection { + include: true, + label: "Conditional Functions", + description: None, + }; + + pub const DOC_SECTION_STRING: DocSection = DocSection { + include: true, + label: "String Functions", + description: None, + }; + + pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection { + include: true, + label: "Binary String Functions", + description: None, + }; + + pub const DOC_SECTION_REGEX: DocSection = DocSection { + include: true, + label: "Regular Expression Functions", + description: Some( + r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). +The following regular expression functions are supported:"#, + ), + }; + + pub const DOC_SECTION_DATETIME: DocSection = DocSection { + include: true, + label: "Time and Date Functions", + description: None, + }; + + pub const DOC_SECTION_ARRAY: DocSection = DocSection { + include: true, + label: "Array Functions", + description: None, + }; + + pub const DOC_SECTION_STRUCT: DocSection = DocSection { + include: true, + label: "Struct Functions", + description: None, + }; + + pub const DOC_SECTION_MAP: DocSection = DocSection { + include: true, + label: "Map Functions", + description: None, + }; + + pub const DOC_SECTION_HASHING: DocSection = DocSection { + include: true, + label: "Hashing Functions", + description: None, + }; + + pub const DOC_SECTION_OTHER: DocSection = DocSection { + include: true, + label: "Other Functions", + description: None, + }; + + pub const DOC_SECTION_UNION: DocSection = DocSection { + include: true, + label: "Union Functions", + description: Some("Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. 
Note: Not related to the SQL UNION operator"), + }; +} diff --git a/datafusion/doc/src/udwf.rs b/datafusion/doc/src/udwf.rs new file mode 100644 index 0000000000..0257ce5ba6 --- /dev/null +++ b/datafusion/doc/src/udwf.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Window UDF doc sections for use in public documentation +pub mod window_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_AGGREGATE, + DOC_SECTION_RANKING, + DOC_SECTION_ANALYTICAL, + ] + } + + pub const DOC_SECTION_AGGREGATE: DocSection = DocSection { + include: true, + label: "Aggregate Functions", + description: Some("All aggregate functions can be used as window functions."), + }; + + pub const DOC_SECTION_RANKING: DocSection = DocSection { + include: true, + label: "Ranking Functions", + description: None, + }; + + pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection { + include: true, + label: "Analytical Functions", + description: None, + }; +} diff --git a/datafusion/execution/README.md b/datafusion/execution/README.md index dd82e206e6..5b1528b0da 100644 --- a/datafusion/execution/README.md +++ b/datafusion/execution/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Execution +# Apache DataFusion Execution -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides execution runtime such as the memory pools and disk manager. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index b11596c4a3..19f97f9e79 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -201,6 +201,14 @@ impl FunctionRegistry for TaskContext { fn expr_planners(&self) -> Vec> { vec![] } + + fn udafs(&self) -> HashSet { + self.aggregate_functions.keys().cloned().collect() + } + + fn udwfs(&self) -> HashSet { + self.window_functions.keys().cloned().collect() + } } #[cfg(test)] diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index 14717dd781..db85f32079 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -19,6 +19,7 @@ name = "datafusion-expr-common" description = "Logical plan and expression representation for DataFusion query engine" keywords = ["datafusion", "logical", "plan", "expressions"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } diff --git a/datafusion/expr-common/README.md b/datafusion/expr-common/README.md index 5f95627ca0..9700670254 100644 --- a/datafusion/expr-common/README.md +++ b/datafusion/expr-common/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Logical Plan and Expressions +# Apache DataFusion Common Logical Plan and Expressions -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides common logical expressions @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
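With the `udafs()`/`udwfs()` additions to `TaskContext` earlier in this patch, the `FunctionRegistry` implementation now exposes aggregate and window function names alongside the existing scalar `udfs()`. A minimal sketch of enumerating them; the import paths are assumptions, and a default context has nothing registered, so the printed sets are expected to be empty:

    use datafusion_execution::registry::FunctionRegistry;
    use datafusion_execution::TaskContext;

    fn main() {
        let ctx = TaskContext::default();
        println!("scalar udfs:     {:?}", ctx.udfs());
        println!("aggregate udafs: {:?}", ctx.udafs()); // added in this patch
        println!("window udwfs:    {:?}", ctx.udwfs()); // added in this patch
    }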
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/expr-common/src/casts.rs b/datafusion/expr-common/src/casts.rs index c31d4f77c6..8939ff1371 100644 --- a/datafusion/expr-common/src/casts.rs +++ b/datafusion/expr-common/src/casts.rs @@ -25,7 +25,9 @@ use std::cmp::Ordering; use arrow::datatypes::{ DataType, TimeUnit, MAX_DECIMAL128_FOR_EACH_PRECISION, - MIN_DECIMAL128_FOR_EACH_PRECISION, + MAX_DECIMAL32_FOR_EACH_PRECISION, MAX_DECIMAL64_FOR_EACH_PRECISION, + MIN_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL32_FOR_EACH_PRECISION, + MIN_DECIMAL64_FOR_EACH_PRECISION, }; use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS}; use datafusion_common::ScalarValue; @@ -69,6 +71,8 @@ fn is_supported_numeric_type(data_type: &DataType) -> bool { | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Timestamp(_, _) ) @@ -114,6 +118,8 @@ fn try_cast_numeric_literal( | DataType::Int32 | DataType::Int64 => 1_i128, DataType::Timestamp(_, _) => 1_i128, + DataType::Decimal32(_, scale) => 10_i128.pow(*scale as u32), + DataType::Decimal64(_, scale) => 10_i128.pow(*scale as u32), DataType::Decimal128(_, scale) => 10_i128.pow(*scale as u32), _ => return None, }; @@ -127,6 +133,20 @@ fn try_cast_numeric_literal( DataType::Int32 => (i32::MIN as i128, i32::MAX as i128), DataType::Int64 => (i64::MIN as i128, i64::MAX as i128), DataType::Timestamp(_, _) => (i64::MIN as i128, i64::MAX as i128), + DataType::Decimal32(precision, _) => ( + // Different precision for decimal32 can store different range of value. + // For example, the precision is 3, the max of value is `999` and the min + // value is `-999` + MIN_DECIMAL32_FOR_EACH_PRECISION[*precision as usize] as i128, + MAX_DECIMAL32_FOR_EACH_PRECISION[*precision as usize] as i128, + ), + DataType::Decimal64(precision, _) => ( + // Different precision for decimal64 can store different range of value. + // For example, the precision is 3, the max of value is `999` and the min + // value is `-999` + MIN_DECIMAL64_FOR_EACH_PRECISION[*precision as usize] as i128, + MAX_DECIMAL64_FOR_EACH_PRECISION[*precision as usize] as i128, + ), DataType::Decimal128(precision, _) => ( // Different precision for decimal128 can store different range of value. 
// For example, the precision is 3, the max of value is `999` and the min @@ -149,6 +169,46 @@ fn try_cast_numeric_literal( ScalarValue::TimestampMillisecond(Some(v), _) => (*v as i128).checked_mul(mul), ScalarValue::TimestampMicrosecond(Some(v), _) => (*v as i128).checked_mul(mul), ScalarValue::TimestampNanosecond(Some(v), _) => (*v as i128).checked_mul(mul), + ScalarValue::Decimal32(Some(v), _, scale) => { + let v = *v as i128; + let lit_scale_mul = 10_i128.pow(*scale as u32); + if mul >= lit_scale_mul { + // Example: + // lit is decimal(123,3,2) + // target type is decimal(5,3) + // the lit can be converted to the decimal(1230,5,3) + v.checked_mul(mul / lit_scale_mul) + } else if v % (lit_scale_mul / mul) == 0 { + // Example: + // lit is decimal(123000,10,3) + // target type is int32: the lit can be converted to INT32(123) + // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2) + Some(v / (lit_scale_mul / mul)) + } else { + // can't convert the lit decimal to the target data type + None + } + } + ScalarValue::Decimal64(Some(v), _, scale) => { + let v = *v as i128; + let lit_scale_mul = 10_i128.pow(*scale as u32); + if mul >= lit_scale_mul { + // Example: + // lit is decimal(123,3,2) + // target type is decimal(5,3) + // the lit can be converted to the decimal(1230,5,3) + v.checked_mul(mul / lit_scale_mul) + } else if v % (lit_scale_mul / mul) == 0 { + // Example: + // lit is decimal(123000,10,3) + // target type is int32: the lit can be converted to INT32(123) + // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2) + Some(v / (lit_scale_mul / mul)) + } else { + // can't convert the lit decimal to the target data type + None + } + } ScalarValue::Decimal128(Some(v), _, scale) => { let lit_scale_mul = 10_i128.pow(*scale as u32); if mul >= lit_scale_mul { @@ -218,6 +278,12 @@ fn try_cast_numeric_literal( ); ScalarValue::TimestampNanosecond(value, tz.clone()) } + DataType::Decimal32(p, s) => { + ScalarValue::Decimal32(Some(value as i32), *p, *s) + } + DataType::Decimal64(p, s) => { + ScalarValue::Decimal64(Some(value as i64), *p, *s) + } DataType::Decimal128(p, s) => { ScalarValue::Decimal128(Some(value), *p, *s) } @@ -339,7 +405,7 @@ mod tests { let actual_value = try_cast_literal_to_type(&literal, &target_type); println!("expect_cast: "); - println!(" {literal:?} --> {target_type:?}"); + println!(" {literal:?} --> {target_type}"); println!(" expected_result: {expected_result:?}"); println!(" actual_result: {actual_value:?}"); diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 6820b933f2..9ed39806fe 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -266,6 +266,8 @@ pub enum TypeSignatureClass { // TODO: // Numeric Integer, + /// Encompasses both the native Binary as well as arbitrarily sized FixedSizeBinary types + Binary, } impl Display for TypeSignatureClass { @@ -303,6 +305,9 @@ impl TypeSignatureClass { TypeSignatureClass::Integer => { vec![DataType::Int64] } + TypeSignatureClass::Binary => { + vec![DataType::Binary] + } } } @@ -322,6 +327,7 @@ impl TypeSignatureClass { TypeSignatureClass::Interval if logical_type.is_interval() => true, TypeSignatureClass::Duration if logical_type.is_duration() => true, TypeSignatureClass::Integer if logical_type.is_integer() => true, + TypeSignatureClass::Binary if logical_type.is_binary() => true, _ => false, } } @@ -352,6 +358,9 @@ impl TypeSignatureClass { TypeSignatureClass::Integer if 
native_type.is_integer() => { Ok(origin_type.to_owned()) } + TypeSignatureClass::Binary if native_type.is_binary() => { + Ok(origin_type.to_owned()) + } _ => internal_err!("May miss the matching logic in `matches_native_type`"), } } @@ -960,7 +969,7 @@ impl Signature { ArrayFunctionArgument::Array, ArrayFunctionArgument::Element, ], - array_coercion: None, + array_coercion: Some(ListCoercion::FixedSizedListToList), }), TypeSignature::ArraySignature(ArrayFunctionSignature::Array { arguments: vec![ @@ -968,7 +977,7 @@ impl Signature { ArrayFunctionArgument::Element, ArrayFunctionArgument::Index, ], - array_coercion: None, + array_coercion: Some(ListCoercion::FixedSizedListToList), }), ]), volatility, diff --git a/datafusion/expr-common/src/statistics.rs b/datafusion/expr-common/src/statistics.rs index 14f2f331ef..5c5e397e74 100644 --- a/datafusion/expr-common/src/statistics.rs +++ b/datafusion/expr-common/src/statistics.rs @@ -189,7 +189,7 @@ impl Distribution { pub fn target_type(args: &[&ScalarValue]) -> Result { let mut arg_types = args .iter() - .filter(|&&arg| (arg != &ScalarValue::Null)) + .filter(|&&arg| arg != &ScalarValue::Null) .map(|&arg| arg.data_type()); let Some(dt) = arg_types.next().map_or_else( diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs index e9377ce7de..e77a072a84 100644 --- a/datafusion/expr-common/src/type_coercion/aggregates.rs +++ b/datafusion/expr-common/src/type_coercion/aggregates.rs @@ -18,7 +18,8 @@ use crate::signature::TypeSignature; use arrow::datatypes::{ DataType, FieldRef, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, - DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, + DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL32_MAX_PRECISION, + DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE, }; use datafusion_common::{internal_err, plan_err, Result}; @@ -150,6 +151,18 @@ pub fn sum_return_type(arg_type: &DataType) -> Result { DataType::Int64 => Ok(DataType::Int64), DataType::UInt64 => Ok(DataType::UInt64), DataType::Float64 => Ok(DataType::Float64), + DataType::Decimal32(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal32(new_precision, *scale)) + } + DataType::Decimal64(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal64(new_precision, *scale)) + } DataType::Decimal128(precision, scale) => { // In the spark, the result type is DECIMAL(min(38,precision+10), s) // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 @@ -171,7 +184,7 @@ pub fn variance_return_type(arg_type: &DataType) -> Result { if NUMERICS.contains(arg_type) { Ok(DataType::Float64) } else { - plan_err!("VAR does not support {arg_type:?}") + plan_err!("VAR does not support {arg_type}") } } @@ -180,7 +193,7 @@ pub fn 
covariance_return_type(arg_type: &DataType) -> Result { if NUMERICS.contains(arg_type) { Ok(DataType::Float64) } else { - plan_err!("COVAR does not support {arg_type:?}") + plan_err!("COVAR does not support {arg_type}") } } @@ -189,13 +202,27 @@ pub fn correlation_return_type(arg_type: &DataType) -> Result { if NUMERICS.contains(arg_type) { Ok(DataType::Float64) } else { - plan_err!("CORR does not support {arg_type:?}") + plan_err!("CORR does not support {arg_type}") } } /// Function return type of an average pub fn avg_return_type(func_name: &str, arg_type: &DataType) -> Result { match arg_type { + DataType::Decimal32(precision, scale) => { + // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)). + // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66 + let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 4); + let new_scale = DECIMAL32_MAX_SCALE.min(*scale + 4); + Ok(DataType::Decimal32(new_precision, new_scale)) + } + DataType::Decimal64(precision, scale) => { + // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)). + // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66 + let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 4); + let new_scale = DECIMAL64_MAX_SCALE.min(*scale + 4); + Ok(DataType::Decimal64(new_precision, new_scale)) + } DataType::Decimal128(precision, scale) => { // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)). // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66 @@ -222,6 +249,16 @@ pub fn avg_return_type(func_name: &str, arg_type: &DataType) -> Result /// Internal sum type of an average pub fn avg_sum_type(arg_type: &DataType) -> Result { match arg_type { + DataType::Decimal32(precision, scale) => { + // In the spark, the sum type of avg is DECIMAL(min(38,precision+10), s) + let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal32(new_precision, *scale)) + } + DataType::Decimal64(precision, scale) => { + // In the spark, the sum type of avg is DECIMAL(min(38,precision+10), s) + let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal64(new_precision, *scale)) + } DataType::Decimal128(precision, scale) => { // In the spark, the sum type of avg is DECIMAL(min(38,precision+10), s) let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 10); @@ -249,7 +286,7 @@ pub fn is_sum_support_arg_type(arg_type: &DataType) -> bool { _ => matches!( arg_type, arg_type if NUMERICS.contains(arg_type) - || matches!(arg_type, DataType::Decimal128(_, _) | DataType::Decimal256(_, _)) + || matches!(arg_type, DataType::Decimal32(_, _) | DataType::Decimal64(_, _) |DataType::Decimal128(_, _) | DataType::Decimal256(_, _)) ), } } @@ -262,7 +299,7 @@ pub fn is_avg_support_arg_type(arg_type: &DataType) -> bool { _ => matches!( arg_type, arg_type if NUMERICS.contains(arg_type) - || matches!(arg_type, DataType::Decimal128(_, _)| DataType::Decimal256(_, _)) + || matches!(arg_type, DataType::Decimal32(_, _) | DataType::Decimal64(_, _) |DataType::Decimal128(_, _) | DataType::Decimal256(_, _)) ), } } @@ -297,6 +334,8 @@ pub fn 
coerce_avg_type(func_name: &str, arg_types: &[DataType]) -> Result Result { match &data_type { + DataType::Decimal32(p, s) => Ok(DataType::Decimal32(*p, *s)), + DataType::Decimal64(p, s) => Ok(DataType::Decimal64(*p, *s)), DataType::Decimal128(p, s) => Ok(DataType::Decimal128(*p, *s)), DataType::Decimal256(p, s) => Ok(DataType::Decimal256(*p, *s)), d if d.is_numeric() => Ok(DataType::Float64), @@ -304,7 +343,7 @@ pub fn coerce_avg_type(func_name: &str, arg_types: &[DataType]) -> Result coerced_type(func_name, v.as_ref()), _ => { plan_err!( - "The function {:?} does not support inputs of type {:?}.", + "The function {:?} does not support inputs of type {}.", func_name, data_type ) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index f344a71451..1c99f49d26 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -27,6 +27,8 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{ DataType, Field, FieldRef, Fields, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, + DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION, + DECIMAL64_MAX_SCALE, }; use datafusion_common::types::NativeType; use datafusion_common::{ @@ -204,7 +206,7 @@ impl<'a> BinaryTypeCoercer<'a> { } And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) { // Logical binary boolean operators can only be evaluated for - // boolean or null arguments. + // boolean or null arguments. Ok(Signature::uniform(Boolean)) } else { plan_err!( @@ -341,22 +343,64 @@ fn math_decimal_coercion( let (lhs_type, value_type) = math_decimal_coercion(lhs_type, value_type)?; Some((lhs_type, value_type)) } - (Null, dec_type @ Decimal128(_, _)) | (dec_type @ Decimal128(_, _), Null) => { - Some((dec_type.clone(), dec_type.clone())) - } - (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { + ( + Null, + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + ) => Some((rhs_type.clone(), rhs_type.clone())), + ( + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + Null, + ) => Some((lhs_type.clone(), lhs_type.clone())), + (Decimal32(_, _), Decimal32(_, _)) + | (Decimal64(_, _), Decimal64(_, _)) + | (Decimal128(_, _), Decimal128(_, _)) + | (Decimal256(_, _), Decimal256(_, _)) => { Some((lhs_type.clone(), rhs_type.clone())) } // Unlike with comparison we don't coerce to a decimal in the case of floating point // numbers, instead falling back to floating point arithmetic instead + ( + Decimal32(_, _), + Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, + ) => Some(( + lhs_type.clone(), + coerce_numeric_type_to_decimal32(rhs_type)?, + )), + ( + Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, + Decimal32(_, _), + ) => Some(( + coerce_numeric_type_to_decimal32(lhs_type)?, + rhs_type.clone(), + )), + ( + Decimal64(_, _), + Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, + ) => Some(( + lhs_type.clone(), + coerce_numeric_type_to_decimal64(rhs_type)?, + )), + ( + Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, + Decimal64(_, _), + ) => Some(( + coerce_numeric_type_to_decimal64(lhs_type)?, + rhs_type.clone(), + )), ( Decimal128(_, _), Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, - ) => Some((lhs_type.clone(), coerce_numeric_type_to_decimal(rhs_type)?)), + ) => Some(( + lhs_type.clone(), 
+ coerce_numeric_type_to_decimal128(rhs_type)?, + )), ( Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _), - ) => Some((coerce_numeric_type_to_decimal(lhs_type)?, rhs_type.clone())), + ) => Some(( + coerce_numeric_type_to_decimal128(lhs_type)?, + rhs_type.clone(), + )), ( Decimal256(_, _), Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, @@ -694,7 +738,7 @@ pub fn try_type_union_resolution_with_struct( keys_string = Some(keys); } } else { - return exec_err!("Expect to get struct but got {}", data_type); + return exec_err!("Expect to get struct but got {data_type}"); } } @@ -726,7 +770,7 @@ pub fn try_type_union_resolution_with_struct( } } } else { - return exec_err!("Expect to get struct but got {}", data_type); + return exec_err!("Expect to get struct but got {data_type}"); } } @@ -932,8 +976,8 @@ fn get_common_decimal_type( ) -> Option { use arrow::datatypes::DataType::*; match decimal_type { - Decimal128(_, _) => { - let other_decimal_type = coerce_numeric_type_to_decimal(other_type)?; + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) => { + let other_decimal_type = coerce_numeric_type_to_decimal128(other_type)?; get_wider_decimal_type(decimal_type, &other_decimal_type) } Decimal256(_, _) => { @@ -953,11 +997,23 @@ fn get_wider_decimal_type( rhs_type: &DataType, ) -> Option { match (lhs_decimal_type, rhs_type) { + (DataType::Decimal32(p1, s1), DataType::Decimal32(p2, s2)) => { + // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) + let s = *s1.max(s2); + let range = (*p1 as i8 - s1).max(*p2 as i8 - s2); + Some(create_decimal32_type((range + s) as u8, s)) + } + (DataType::Decimal64(p1, s1), DataType::Decimal64(p2, s2)) => { + // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) + let s = *s1.max(s2); + let range = (*p1 as i8 - s1).max(*p2 as i8 - s2); + Some(create_decimal64_type((range + s) as u8, s)) + } (DataType::Decimal128(p1, s1), DataType::Decimal128(p2, s2)) => { // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) let s = *s1.max(s2); let range = (*p1 as i8 - s1).max(*p2 as i8 - s2); - Some(create_decimal_type((range + s) as u8, s)) + Some(create_decimal128_type((range + s) as u8, s)) } (DataType::Decimal256(p1, s1), DataType::Decimal256(p2, s2)) => { // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) @@ -971,7 +1027,39 @@ fn get_wider_decimal_type( /// Convert the numeric data type to the decimal data type. /// We support signed and unsigned integer types and floating-point type. -fn coerce_numeric_type_to_decimal(numeric_type: &DataType) -> Option { +fn coerce_numeric_type_to_decimal32(numeric_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + // This conversion rule is from spark + // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127 + match numeric_type { + Int8 | UInt8 => Some(Decimal32(3, 0)), + Int16 | UInt16 => Some(Decimal32(5, 0)), + // TODO if we convert the floating-point data to the decimal type, it maybe overflow. + Float16 => Some(Decimal32(6, 3)), + _ => None, + } +} + +/// Convert the numeric data type to the decimal data type. +/// We support signed and unsigned integer types and floating-point type. 
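For reference, the widening rule in get_wider_decimal_type above keeps the larger scale and enough integer digits for the larger integer part, max(s1, s2) + max(p1 - s1, p2 - s2). A standalone sketch of that arithmetic (illustrative names, not the arrow-typed implementation):

// Keep the larger scale and enough integer digits for the larger integer part.
fn wider(p1: u8, s1: i8, p2: u8, s2: i8) -> (u8, i8) {
    let s = s1.max(s2);
    let integer_digits = (p1 as i8 - s1).max(p2 as i8 - s2);
    ((integer_digits + s) as u8, s)
}

fn main() {
    // Decimal(10, 2) combined with Decimal(7, 4): 8 integer digits, scale 4
    assert_eq!(wider(10, 2, 7, 4), (12, 4));
    // Identical inputs are returned unchanged
    assert_eq!(wider(5, 0, 5, 0), (5, 0));
}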
+fn coerce_numeric_type_to_decimal64(numeric_type: &DataType) -> Option<DataType> { + use arrow::datatypes::DataType::*; + // This conversion rule is from spark + // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127 + match numeric_type { + Int8 | UInt8 => Some(Decimal64(3, 0)), + Int16 | UInt16 => Some(Decimal64(5, 0)), + Int32 | UInt32 => Some(Decimal64(10, 0)), + // TODO if we convert the floating-point data to the decimal type, it maybe overflow. + Float16 => Some(Decimal64(6, 3)), + Float32 => Some(Decimal64(14, 7)), + _ => None, + } +} + +/// Convert the numeric data type to the decimal data type. +/// We support signed and unsigned integer types and floating-point type. +fn coerce_numeric_type_to_decimal128(numeric_type: &DataType) -> Option<DataType> { use arrow::datatypes::DataType::*; // This conversion rule is from spark // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127 match numeric_type { @@ -1120,7 +1208,21 @@ fn numerical_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option DataType { +fn create_decimal32_type(precision: u8, scale: i8) -> DataType { + DataType::Decimal32( + DECIMAL32_MAX_PRECISION.min(precision), + DECIMAL32_MAX_SCALE.min(scale), + ) +} + +fn create_decimal64_type(precision: u8, scale: i8) -> DataType { + DataType::Decimal64( + DECIMAL64_MAX_PRECISION.min(precision), + DECIMAL64_MAX_SCALE.min(scale), + ) +} + +fn create_decimal128_type(precision: u8, scale: i8) -> DataType { DataType::Decimal128( DECIMAL128_MAX_PRECISION.min(precision), DECIMAL128_MAX_SCALE.min(scale), diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs b/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs index fdd41ae2bb..e6238ba007 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs @@ -56,32 +56,75 @@ fn test_date_timestamp_arithmetic_error() -> Result<()> { #[test] fn test_decimal_mathematics_op_type() { + // Decimal32 assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Int8).unwrap(), + coerce_numeric_type_to_decimal32(&DataType::Int8).unwrap(), + DataType::Decimal32(3, 0) + ); + assert_eq!( + coerce_numeric_type_to_decimal32(&DataType::Int16).unwrap(), + DataType::Decimal32(5, 0) + ); + assert!(coerce_numeric_type_to_decimal32(&DataType::Int32).is_none()); + assert!(coerce_numeric_type_to_decimal32(&DataType::Int64).is_none(),); + assert_eq!( + coerce_numeric_type_to_decimal32(&DataType::Float16).unwrap(), + DataType::Decimal32(6, 3) + ); + assert!(coerce_numeric_type_to_decimal32(&DataType::Float32).is_none(),); + assert!(coerce_numeric_type_to_decimal32(&DataType::Float64).is_none()); + + // Decimal64 + assert_eq!( + coerce_numeric_type_to_decimal64(&DataType::Int8).unwrap(), + DataType::Decimal64(3, 0) + ); + assert_eq!( + coerce_numeric_type_to_decimal64(&DataType::Int16).unwrap(), + DataType::Decimal64(5, 0) + ); + assert_eq!( + coerce_numeric_type_to_decimal64(&DataType::Int32).unwrap(), + DataType::Decimal64(10, 0) + ); + assert!(coerce_numeric_type_to_decimal64(&DataType::Int64).is_none(),); + assert_eq!( + coerce_numeric_type_to_decimal64(&DataType::Float16).unwrap(), + DataType::Decimal64(6, 3) + ); + assert_eq!( + coerce_numeric_type_to_decimal64(&DataType::Float32).unwrap(), + DataType::Decimal64(14, 7) + ); +
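The integer rows of the Spark-derived coercion tables exercised by these tests follow a simple rule: enough base-10 digits to hold every value of the source integer type, with scale 0. A std-only illustration (hypothetical helper, not part of the patch):

// Count the decimal digits needed to represent `max_abs`.
fn digits(max_abs: u128) -> u8 {
    let mut n = 0u8;
    let mut v = max_abs;
    while v > 0 {
        n += 1;
        v /= 10;
    }
    n.max(1)
}

fn main() {
    assert_eq!(digits(i8::MAX as u128), 3);   // Int8/UInt8   -> Decimal(3, 0)
    assert_eq!(digits(i16::MAX as u128), 5);  // Int16/UInt16 -> Decimal(5, 0)
    assert_eq!(digits(i32::MAX as u128), 10); // Int32/UInt32 -> Decimal(10, 0)
    assert_eq!(digits(u64::MAX as u128), 20); // 64-bit ints  -> Decimal(20, 0)
}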
assert!(coerce_numeric_type_to_decimal64(&DataType::Float64).is_none()); + + // Decimal128 + assert_eq!( + coerce_numeric_type_to_decimal128(&DataType::Int8).unwrap(), DataType::Decimal128(3, 0) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Int16).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Int16).unwrap(), DataType::Decimal128(5, 0) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Int32).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Int32).unwrap(), DataType::Decimal128(10, 0) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Int64).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Int64).unwrap(), DataType::Decimal128(20, 0) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Float16).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Float16).unwrap(), DataType::Decimal128(6, 3) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Float32).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Float32).unwrap(), DataType::Decimal128(14, 7) ); assert_eq!( - coerce_numeric_type_to_decimal(&DataType::Float64).unwrap(), + coerce_numeric_type_to_decimal128(&DataType::Float64).unwrap(), DataType::Decimal128(30, 15) ); } diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 2a8e5ecfb1..e6b2734cff 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -53,7 +53,7 @@ datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } indexmap = { workspace = true } -itertools = "0.14.0" +itertools = { workspace = true } paste = "^1.0" recursive = { workspace = true, optional = true } serde_json = { workspace = true } diff --git a/datafusion/expr/README.md b/datafusion/expr/README.md index 860c36769e..b3ab9a383d 100644 --- a/datafusion/expr/README.md +++ b/datafusion/expr/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Logical Plan and Expressions +# Apache DataFusion Logical Plan and Expressions -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides data types and utilities for logical plans and expressions. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/expr/src/conditional_expressions.rs b/datafusion/expr/src/conditional_expressions.rs index 69525ea521..8910b2e9b5 100644 --- a/datafusion/expr/src/conditional_expressions.rs +++ b/datafusion/expr/src/conditional_expressions.rs @@ -20,6 +20,7 @@ use crate::expr::Case; use crate::{expr_schema::ExprSchemable, Expr}; use arrow::datatypes::DataType; use datafusion_common::{plan_err, DFSchema, HashSet, Result}; +use itertools::Itertools as _; /// Helper struct for building [Expr::Case] pub struct CaseBuilder { @@ -81,9 +82,12 @@ impl CaseBuilder { // Cannot verify types until execution type } else { let unique_types: HashSet<&DataType> = then_types.iter().collect(); - if unique_types.len() != 1 { + if unique_types.is_empty() { + return plan_err!("CASE expression 'then' values had no data types"); + } else if unique_types.len() != 1 { return plan_err!( - "CASE expression 'then' values had multiple data types: {unique_types:?}" + "CASE expression 'then' values had multiple data types: {}", + unique_types.iter().join(", ") ); } } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 53db2dd7ac..406a1abe42 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -3482,10 +3482,10 @@ impl Display for Expr { write!(f, "END") } Expr::Cast(Cast { expr, data_type }) => { - write!(f, "CAST({expr} AS {data_type:?})") + write!(f, "CAST({expr} AS {data_type})") } Expr::TryCast(TryCast { expr, data_type }) => { - write!(f, "TRY_CAST({expr} AS {data_type:?})") + write!(f, "TRY_CAST({expr} AS {data_type})") } Expr::Not(expr) => write!(f, "NOT {expr}"), Expr::Negative(expr) => write!(f, "(- {expr})"), diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index d0988514eb..e803e35341 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -482,7 +482,7 @@ impl ExprSchemable for Expr { Ok(Arc::new(Field::new(&schema_name, DataType::Boolean, false))) } Expr::ScalarSubquery(subquery) => { - Ok(Arc::new(subquery.subquery.schema().field(0).clone())) + Ok(Arc::clone(&subquery.subquery.schema().fields()[0])) } Expr::BinaryExpr(BinaryExpr { ref left, @@ -642,7 +642,7 @@ impl ExprSchemable for Expr { _ => Ok(Expr::Cast(Cast::new(Box::new(self), cast_to_type.clone()))), } } else { - plan_err!("Cannot automatically convert {this_type:?} to {cast_to_type:?}") + plan_err!("Cannot automatically convert {this_type} to {cast_to_type}") } } } diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 929d0664b4..d525ade2c3 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -79,7 +79,10 @@ pub mod var_provider; pub mod window_frame; pub mod window_state; -pub use datafusion_doc::{DocSection, Documentation, DocumentationBuilder}; +pub use datafusion_doc::{ + aggregate_doc_sections, scalar_doc_sections, window_doc_sections, DocSection, + Documentation, DocumentationBuilder, +}; pub use datafusion_expr_common::accumulator::Accumulator; pub use datafusion_expr_common::columnar_value::ColumnarValue; pub use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator}; @@ -108,15 +111,13 @@ pub use partition_evaluator::PartitionEvaluator; pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; pub use udaf::{ - 
aggregate_doc_sections, udaf_default_display_name, udaf_default_human_display, - udaf_default_return_field, udaf_default_schema_name, - udaf_default_window_function_display_name, udaf_default_window_function_schema_name, - AggregateUDF, AggregateUDFImpl, ReversedUDAF, SetMonotonicity, StatisticsArgs, -}; -pub use udf::{ - scalar_doc_sections, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + udaf_default_display_name, udaf_default_human_display, udaf_default_return_field, + udaf_default_schema_name, udaf_default_window_function_display_name, + udaf_default_window_function_schema_name, AggregateUDF, AggregateUDFImpl, + ReversedUDAF, SetMonotonicity, StatisticsArgs, }; -pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; +pub use udf::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl}; +pub use udwf::{ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; #[cfg(test)] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 511d8c27a5..7b57bce105 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -18,13 +18,14 @@ //! This module provides a builder for creating LogicalPlans use std::any::Any; +use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::iter::once; use std::sync::Arc; use crate::dml::CopyTo; -use crate::expr::{Alias, PlannedReplaceSelectItem, Sort as SortExpr}; +use crate::expr::{Alias, FieldMetadata, PlannedReplaceSelectItem, Sort as SortExpr}; use crate::expr_rewriter::{ coerce_plan_expr_for_schema, normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, normalize_sorts, @@ -281,15 +282,14 @@ impl LogicalPlanBuilder { let value = &row[j]; let data_type = value.get_type(schema)?; - if !data_type.equals_datatype(field_type) { - if can_cast_types(&data_type, field_type) { - } else { - return exec_err!( - "type mismatch and can't cast to got {} and {}", - data_type, - field_type - ); - } + if !data_type.equals_datatype(field_type) + && !can_cast_types(&data_type, field_type) + { + return exec_err!( + "type mismatch and can't cast to got {} and {}", + data_type, + field_type + ); } } fields.push(field_type.to_owned(), field_nullable); @@ -305,8 +305,17 @@ impl LogicalPlanBuilder { for j in 0..n_cols { let mut common_type: Option = None; + let mut common_metadata: Option = None; for (i, row) in values.iter().enumerate() { let value = &row[j]; + let metadata = value.metadata(&schema)?; + if let Some(ref cm) = common_metadata { + if &metadata != cm { + return plan_err!("Inconsistent metadata across values list at row {i} column {j}. 
Was {:?} but found {:?}", cm, metadata); + } + } else { + common_metadata = Some(metadata.clone()); + } let data_type = value.get_type(&schema)?; if data_type == DataType::Null { continue; @@ -325,7 +334,11 @@ impl LogicalPlanBuilder { } // assuming common_type was not set, and no error, therefore the type should be NULL // since the code loop skips NULL - fields.push(common_type.unwrap_or(DataType::Null), true); + fields.push_with_metadata( + common_type.unwrap_or(DataType::Null), + true, + common_metadata, + ); } Self::infer_inner(values, fields, &schema) @@ -1506,10 +1519,23 @@ impl ValuesFields { } pub fn push(&mut self, data_type: DataType, nullable: bool) { + self.push_with_metadata(data_type, nullable, None); + } + + pub fn push_with_metadata( + &mut self, + data_type: DataType, + nullable: bool, + metadata: Option, + ) { // Naming follows the convention described here: // https://www.postgresql.org/docs/current/queries-values.html let name = format!("column{}", self.inner.len() + 1); - self.inner.push(Field::new(name, data_type, nullable)); + let mut field = Field::new(name, data_type, nullable); + if let Some(metadata) = metadata { + field.set_metadata(metadata.to_hashmap()); + } + self.inner.push(field); } pub fn into_fields(self) -> Fields { @@ -1517,37 +1543,49 @@ impl ValuesFields { } } -// `name_map` tracks a mapping between a field name and the number of appearances of that field. -// -// Some field names might already come to this function with the count (number of times it appeared) -// as a suffix e.g. id:1, so there's still a chance of name collisions, for example, -// if these three fields passed to this function: "col:1", "col" and "col", the function -// would rename them to -> col:1, col, col:1 causing a posteriror error when building the DFSchema. -// that's why we need the `seen` set, so the fields are always unique. -// -pub fn change_redundant_column(fields: &Fields) -> Vec { - let mut name_map = HashMap::new(); - let mut seen: HashSet = HashSet::new(); +/// Returns aliases to make field names unique. +/// +/// Returns a vector of optional aliases, one per input field. `None` means keep the original name, +/// `Some(alias)` means rename to the alias to ensure uniqueness. +/// +/// Used when creating [`SubqueryAlias`] or similar operations that strip table qualifiers but need +/// to maintain unique column names. +/// +/// # Example +/// Input fields: `[a, a, b, b, a, a:1]` ([`DFSchema`] valid when duplicate fields have different qualifiers) +/// Returns: `[None, Some("a:1"), None, Some("b:1"), Some("a:2"), Some("a:1:1")]` +pub fn unique_field_aliases(fields: &Fields) -> Vec> { + // Some field names might already come to this function with the count (number of times it appeared) + // as a suffix e.g. id:1, so there's still a chance of name collisions, for example, + // if these three fields passed to this function: "col:1", "col" and "col", the function + // would rename them to -> col:1, col, col:1 causing a posterior error when building the DFSchema. + // That's why we need the `seen` set, so the fields are always unique. + + // Tracks a mapping between a field name and the number of appearances of that field. + let mut name_map = HashMap::<&str, usize>::new(); + // Tracks all the fields and aliases that were previously seen. 
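A self-contained sketch of the same renaming loop over plain strings may help when reviewing; it is illustrative only, the real function walks arrow Fields and returns the aliases shown in the doc comment above:

use std::collections::{HashMap, HashSet};

// First occurrence keeps its name (None); later occurrences get ":<n>" suffixes,
// re-suffixing whenever a candidate collides with a name already seen.
fn unique_aliases(names: &[&str]) -> Vec<Option<String>> {
    let mut counts: HashMap<&str, usize> = HashMap::new();
    let mut seen: HashSet<String> = HashSet::new();
    names
        .iter()
        .map(|&name| {
            let count = counts.entry(name).or_insert(0);
            let mut candidate = name.to_string();
            while seen.contains(&candidate) {
                *count += 1;
                candidate = format!("{name}:{count}");
            }
            seen.insert(candidate.clone());
            (candidate != name).then_some(candidate)
        })
        .collect()
}

fn main() {
    let got = unique_aliases(&["a", "a", "b", "b", "a", "a:1"]);
    let want = vec![
        None,
        Some("a:1".to_string()),
        None,
        Some("b:1".to_string()),
        Some("a:2".to_string()),
        Some("a:1:1".to_string()),
    ];
    assert_eq!(got, want);
}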
+ let mut seen = HashSet::>::new(); fields - .into_iter() + .iter() .map(|field| { - let base_name = field.name(); - let count = name_map.entry(base_name.clone()).or_insert(0); - let mut new_name = base_name.clone(); + let original_name = field.name(); + let mut name = Cow::Borrowed(original_name); + + let count = name_map.entry(original_name).or_insert(0); - // Loop until we find a name that hasn't been used - while seen.contains(&new_name) { + // Loop until we find a name that hasn't been used. + while seen.contains(&name) { *count += 1; - new_name = format!("{base_name}:{count}"); + name = Cow::Owned(format!("{original_name}:{count}")); } - seen.insert(new_name.clone()); + seen.insert(name.clone()); - let mut modified_field = - Field::new(&new_name, field.data_type().clone(), field.is_nullable()); - modified_field.set_metadata(field.metadata().clone()); - modified_field + match name { + Cow::Borrowed(_) => None, + Cow::Owned(alias) => Some(alias), + } }) .collect() } @@ -1957,6 +1995,7 @@ pub fn table_scan_with_filter_and_fetch( } pub fn table_source(table_schema: &Schema) -> Arc { + // TODO should we take SchemaRef and avoid cloning? let table_schema = Arc::new(table_schema.clone()); Arc::new(LogicalTableSource { table_schema, @@ -1968,6 +2007,7 @@ pub fn table_source_with_constraints( table_schema: &Schema, constraints: Constraints, ) -> Arc { + // TODO should we take SchemaRef and avoid cloning? let table_schema = Arc::new(table_schema.clone()); Arc::new(LogicalTableSource { table_schema, @@ -2140,7 +2180,10 @@ pub fn unnest_with_options( #[cfg(test)] mod tests { + use std::vec; + use super::*; + use crate::lit_with_metadata; use crate::logical_plan::StringifiedPlan; use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery}; @@ -2675,34 +2718,6 @@ mod tests { Ok(()) } - #[test] - fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_3 = Field::new("a", DataType::Int32, false); - let t2_field_4 = Field::new("a:1", DataType::Int32, false); - let t1_field_2 = Field::new("b", DataType::Int32, false); - let t2_field_2 = Field::new("b", DataType::Int32, false); - - let field_vec = vec![ - t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4, - ]; - let remove_redundant = change_redundant_column(&Fields::from(field_vec)); - - assert_eq!( - remove_redundant, - vec![ - Field::new("a", DataType::Int32, false), - Field::new("a:1", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - Field::new("b:1", DataType::Int32, false), - Field::new("a:2", DataType::Int32, false), - Field::new("a:1:1", DataType::Int32, false), - ] - ); - Ok(()) - } - #[test] fn plan_builder_from_logical_plan() -> Result<()> { let plan = @@ -2787,4 +2802,68 @@ mod tests { Ok(()) } + + #[test] + fn test_values_metadata() -> Result<()> { + let metadata: HashMap = + [("ARROW:extension:metadata".to_string(), "test".to_string())] + .into_iter() + .collect(); + let metadata = FieldMetadata::from(metadata); + let values = LogicalPlanBuilder::values(vec![ + vec![lit_with_metadata(1, Some(metadata.clone()))], + vec![lit_with_metadata(2, Some(metadata.clone()))], + ])? 
+ .build()?; + assert_eq!(*values.schema().field(0).metadata(), metadata.to_hashmap()); + + // Do not allow VALUES with different metadata mixed together + let metadata2: HashMap = + [("ARROW:extension:metadata".to_string(), "test2".to_string())] + .into_iter() + .collect(); + let metadata2 = FieldMetadata::from(metadata2); + assert!(LogicalPlanBuilder::values(vec![ + vec![lit_with_metadata(1, Some(metadata.clone()))], + vec![lit_with_metadata(2, Some(metadata2.clone()))], + ]) + .is_err()); + + Ok(()) + } + + #[test] + fn test_unique_field_aliases() { + let t1_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_3 = Field::new("a", DataType::Int32, false); + let t2_field_4 = Field::new("a:1", DataType::Int32, false); + let t1_field_2 = Field::new("b", DataType::Int32, false); + let t2_field_2 = Field::new("b", DataType::Int32, false); + + let fields = vec![ + t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4, + ]; + let fields = Fields::from(fields); + + let remove_redundant = unique_field_aliases(&fields); + + // Input [a, a, b, b, a, a:1] becomes [None, a:1, None, b:1, a:2, a:1:1] + // First occurrence of each field name keeps original name (None), duplicates get + // incremental suffixes (:1, :2, etc.). + // Crucially in this case the 2nd occurrence of `a` gets rewritten to `a:1` which later + // conflicts with the last column which is _actually_ called `a:1` so we need to rename it + // as well to `a:1:1`. + assert_eq!( + remove_redundant, + vec![ + None, + Some("a:1".to_string()), + None, + Some("b:1".to_string()), + Some("a:2".to_string()), + Some("a:1:1".to_string()), + ] + ); + } } diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 9953112910..57e7d41cba 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -216,6 +216,8 @@ pub struct CreateExternalTable { pub table_partition_cols: Vec, /// Option to not error if table already exists pub if_not_exists: bool, + /// Option to replace table content if table already exists + pub or_replace: bool, /// Whether the table is a temporary table pub temporary: bool, /// SQL used to create the table, if available diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index cc3fbad7b0..174ab28a1e 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -72,11 +72,7 @@ impl<'n> TreeNodeVisitor<'n> for IndentVisitor<'_, '_> { write!(self.f, "{:indent$}", "", indent = self.indent * 2)?; write!(self.f, "{}", plan.display())?; if self.with_schema { - write!( - self.f, - " {}", - display_schema(&plan.schema().as_ref().to_owned().into()) - )?; + write!(self.f, " {}", display_schema(plan.schema().as_arrow()))?; } self.indent += 1; @@ -196,7 +192,7 @@ impl<'n> TreeNodeVisitor<'n> for GraphvizVisitor<'_, '_> { format!( r"{}\nSchema: {}", plan.display(), - display_schema(&plan.schema().as_ref().to_owned().into()) + display_schema(plan.schema().as_arrow()) ) } else { format!("{}", plan.display()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 7dc750a35c..3cc0322774 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -29,9 +29,9 @@ use super::invariants::{ InvariantLevel, }; use super::DdlStatement; -use crate::builder::{change_redundant_column, 
unnest_with_options}; +use crate::builder::{unique_field_aliases, unnest_with_options}; use crate::expr::{ - intersect_metadata_for_union, Placeholder, Sort as SortExpr, WindowFunction, + intersect_metadata_for_union, Alias, Placeholder, Sort as SortExpr, WindowFunction, WindowFunctionParams, }; use crate::expr_rewriter::{ @@ -2239,15 +2239,47 @@ impl SubqueryAlias { alias: impl Into, ) -> Result { let alias = alias.into(); - let fields = change_redundant_column(plan.schema().fields()); - let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = - DFSchema::from_unqualified_fields(fields.into(), meta_data)?.into(); - // Since schema is the same, other than qualifier, we can use existing - // functional dependencies: + + // Since SubqueryAlias will replace all field qualification for the output schema of `plan`, + // no field must share the same column name as this would lead to ambiguity when referencing + // columns in parent logical nodes. + + // Compute unique aliases, if any, for each column of the input's schema. + let aliases = unique_field_aliases(plan.schema().fields()); + let is_projection_needed = aliases.iter().any(Option::is_some); + + // Insert a projection node, if needed, to make sure aliases are applied. + let plan = if is_projection_needed { + let projection_expressions = aliases + .iter() + .zip(plan.schema().iter()) + .map(|(alias, (qualifier, field))| { + let column = + Expr::Column(Column::new(qualifier.cloned(), field.name())); + match alias { + None => column, + Some(alias) => { + Expr::Alias(Alias::new(column, qualifier.cloned(), alias)) + } + } + }) + .collect(); + let projection = Projection::try_new(projection_expressions, plan)?; + Arc::new(LogicalPlan::Projection(projection)) + } else { + plan + }; + + // Requalify fields with the new `alias`. + let fields = plan.schema().fields().clone(); + let meta_data = plan.schema().metadata().clone(); let func_dependencies = plan.schema().functional_dependencies().clone(); + + let schema = DFSchema::from_unqualified_fields(fields, meta_data)?; + let schema = schema.as_arrow(); + let schema = DFSchemaRef::new( - DFSchema::try_from_qualified_schema(alias.clone(), &schema)? + DFSchema::try_from_qualified_schema(alias.clone(), schema)? 
.with_functional_dependencies(func_dependencies)?, ); Ok(SubqueryAlias { @@ -2670,7 +2702,7 @@ impl TableScan { let df_schema = DFSchema::new_with_metadata( p.iter() .map(|i| { - (Some(table_name.clone()), Arc::new(schema.field(*i).clone())) + (Some(table_name.clone()), Arc::clone(&schema.fields()[*i])) }) .collect(), schema.metadata.clone(), @@ -4155,10 +4187,7 @@ fn get_unnested_columns( })) } _ => { - return internal_err!( - "trying to unnest on invalid data type {:?}", - data_type - ); + return internal_err!("trying to unnest on invalid data type {data_type}"); } }; Ok(qualified_columns) @@ -4182,7 +4211,7 @@ fn get_unnested_list_datatype_recursive( _ => {} }; - internal_err!("trying to unnest on invalid data type {:?}", data_type) + internal_err!("trying to unnest on invalid data type {data_type}") } #[cfg(test)] @@ -5542,7 +5571,7 @@ mod tests { )?; let fields = join.schema.fields(); - assert_eq!(fields.len(), 6, "Expected 6 fields for {join_type:?} join"); + assert_eq!(fields.len(), 6, "Expected 6 fields for {join_type} join"); for (i, field) in fields.iter().enumerate() { let expected_nullable = match (i, &join_type) { diff --git a/datafusion/expr/src/logical_plan/statement.rs b/datafusion/expr/src/logical_plan/statement.rs index 72eb6b39bb..6d3fe9fa75 100644 --- a/datafusion/expr/src/logical_plan/statement.rs +++ b/datafusion/expr/src/logical_plan/statement.rs @@ -17,6 +17,7 @@ use arrow::datatypes::DataType; use datafusion_common::{DFSchema, DFSchemaRef}; +use itertools::Itertools as _; use std::fmt::{self, Display}; use std::sync::{Arc, LazyLock}; @@ -110,7 +111,7 @@ impl Statement { Statement::Prepare(Prepare { name, data_types, .. }) => { - write!(f, "Prepare: {name:?} {data_types:?}") + write!(f, "Prepare: {name:?} [{}]", data_types.iter().join(", ")) } Statement::Execute(Execute { name, parameters, .. diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 4eb49710bc..8ea9e34dac 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -27,9 +27,25 @@ use std::sync::Arc; /// A registry knows how to build logical expressions out of user-defined function' names pub trait FunctionRegistry { - /// Set of all available udfs. + /// Returns names of all available scalar user defined functions. fn udfs(&self) -> HashSet; + /// Returns names of all available aggregate user defined functions. + fn udafs(&self) -> HashSet { + // This default implementation is provided temporarily + // to maintain backward compatibility for the 50.1 release. + // It will be reverted to a required method in future versions. + HashSet::default() + } + + /// Returns names of all available window user defined functions. + fn udwfs(&self) -> HashSet { + // This default implementation is provided temporarily + // to maintain backward compatibility for the 50.1 release. + // It will be reverted to a required method in future versions. + HashSet::default() + } + /// Returns a reference to the user defined scalar function (udf) named /// `name`. 
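The temporary defaults above are the usual backward-compatibility pattern for growing a trait: give new methods a default body so downstream implementors keep compiling, and override them where real data exists. A small sketch of the pattern (illustrative names, not the DataFusion trait):

use std::collections::HashSet;

// New trait methods ship with a default body so existing implementors keep compiling;
// concrete registries override them to report real names.
trait Registry {
    fn udfs(&self) -> HashSet<String>;
    fn udafs(&self) -> HashSet<String> {
        HashSet::default() // temporary default, intended to become required later
    }
}

struct MemoryRegistry {
    scalar: Vec<String>,
    aggregate: Vec<String>,
}

impl Registry for MemoryRegistry {
    fn udfs(&self) -> HashSet<String> {
        self.scalar.iter().cloned().collect()
    }
    fn udafs(&self) -> HashSet<String> {
        self.aggregate.iter().cloned().collect()
    }
}

fn main() {
    let r = MemoryRegistry {
        scalar: vec!["lower".to_string()],
        aggregate: vec!["sum".to_string()],
    };
    assert!(r.udfs().contains("lower") && r.udafs().contains("sum"));
}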
fn udf(&self, name: &str) -> Result>; @@ -200,4 +216,12 @@ impl FunctionRegistry for MemoryFunctionRegistry { fn expr_planners(&self) -> Vec> { vec![] } + + fn udafs(&self) -> HashSet { + self.udafs.keys().cloned().collect() + } + + fn udwfs(&self) -> HashSet { + self.udwfs.keys().cloned().collect() + } } diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs index 3feab09bbd..41bc645058 100644 --- a/datafusion/expr/src/test/function_stub.rs +++ b/datafusion/expr/src/test/function_stub.rs @@ -23,6 +23,7 @@ use std::any::Any; use arrow::datatypes::{ DataType, FieldRef, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; use datafusion_common::{exec_err, not_impl_err, utils::take_function_args, Result}; @@ -135,13 +136,14 @@ impl AggregateUDFImpl for Sum { DataType::Dictionary(_, v) => coerced_type(v), // in the spark, the result type is DECIMAL(min(38,precision+10), s) // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - Ok(data_type.clone()) - } + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) => Ok(data_type.clone()), dt if dt.is_signed_integer() => Ok(DataType::Int64), dt if dt.is_unsigned_integer() => Ok(DataType::UInt64), dt if dt.is_floating() => Ok(DataType::Float64), - _ => exec_err!("Sum not supported for {}", data_type), + _ => exec_err!("Sum not supported for {data_type}"), } } @@ -153,6 +155,18 @@ impl AggregateUDFImpl for Sum { DataType::Int64 => Ok(DataType::Int64), DataType::UInt64 => Ok(DataType::UInt64), DataType::Float64 => Ok(DataType::Float64), + DataType::Decimal32(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal32(new_precision, *scale)) + } + DataType::Decimal64(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal64(new_precision, *scale)) + } DataType::Decimal128(precision, scale) => { // in the spark, the result type is DECIMAL(min(38,precision+10), s) // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index d776d07775..bcaff11bcd 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -36,6 +36,7 @@ use datafusion_expr_common::{ type_coercion::binary::comparison_coercion_numeric, type_coercion::binary::string_coercion, }; +use itertools::Itertools as _; use std::sync::Arc; /// Performs type coercion for scalar function arguments. 
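The decimal widening rules repeated across these hunks, SUM widens precision by 10 and AVG widens precision and scale by 4, both capped at the type's maximum precision, can be summarized with two helpers. This sketch takes the caps as parameters instead of naming the arrow constants:

// SUM over Decimal(p, s) widens to Decimal(min(cap, p + 10), s);
// AVG widens to Decimal(min(cap, p + 4), min(scale_cap, s + 4)).
fn sum_decimal(p: u8, s: i8, cap: u8) -> (u8, i8) {
    (cap.min(p + 10), s)
}

fn avg_decimal(p: u8, s: i8, cap: u8, scale_cap: i8) -> (u8, i8) {
    (cap.min(p + 4), scale_cap.min(s + 4))
}

fn main() {
    // With the 128-bit cap of 38 referenced in the Spark comments above:
    assert_eq!(sum_decimal(10, 2, 38), (20, 2));
    assert_eq!(avg_decimal(10, 2, 38, 38), (14, 6));
    // Near the cap the precision saturates instead of widening past the type
    assert_eq!(sum_decimal(35, 2, 38), (38, 2));
}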
@@ -278,7 +279,8 @@ fn try_coerce_types( // none possible -> Error plan_err!( - "Failed to coerce arguments to satisfy a call to '{function_name}' function: coercion from {current_types:?} to the signature {type_signature:?} failed" + "Failed to coerce arguments to satisfy a call to '{function_name}' function: coercion from {} to the signature {type_signature:?} failed", + current_types.iter().join(", ") ) } @@ -529,7 +531,7 @@ fn get_valid_types( new_types.push(DataType::Utf8); } else { return plan_err!( - "Function '{function_name}' expects NativeType::String but received {logical_data_type}" + "Function '{function_name}' expects NativeType::String but NativeType::received NativeType::{logical_data_type}" ); } } @@ -589,7 +591,7 @@ fn get_valid_types( if !logical_data_type.is_numeric() { return plan_err!( - "Function '{function_name}' expects NativeType::Numeric but received {logical_data_type}" + "Function '{function_name}' expects NativeType::Numeric but received NativeType::{logical_data_type}" ); } @@ -610,7 +612,7 @@ fn get_valid_types( valid_type = DataType::Float64; } else if !logical_data_type.is_numeric() { return plan_err!( - "Function '{function_name}' expects NativeType::Numeric but received {logical_data_type}" + "Function '{function_name}' expects NativeType::Numeric but received NativeType::{logical_data_type}" ); } @@ -657,7 +659,7 @@ fn get_valid_types( new_types.push(casted_type); } else { return internal_err!( - "Expect {} but received {}, DataType: {}", + "Expect {} but received NativeType::{}, DataType: {}", param.desired_type(), current_native_type, current_type @@ -877,7 +879,10 @@ fn coerced_from<'a>( | UInt64 | Float32 | Float64 - | Decimal128(_, _), + | Decimal32(_, _) + | Decimal64(_, _) + | Decimal128(_, _) + | Decimal256(_, _), ) => Some(type_into.clone()), ( Timestamp(TimeUnit::Nanosecond, None), diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs index 4fc150ef29..bd1acd3f3a 100644 --- a/datafusion/expr/src/type_coercion/mod.rs +++ b/datafusion/expr/src/type_coercion/mod.rs @@ -51,6 +51,8 @@ pub fn is_signed_numeric(dt: &DataType) -> bool { | DataType::Float16 | DataType::Float32 | DataType::Float64 + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _), ) @@ -89,5 +91,11 @@ pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool { /// Determine whether the given data type `dt` is a `Decimal`. pub fn is_decimal(dt: &DataType) -> bool { - matches!(dt, DataType::Decimal128(_, _) | DataType::Decimal256(_, _)) + matches!( + dt, + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + ) } diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 2243682cfd..fa71a76c09 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -740,7 +740,20 @@ pub trait AggregateUDFImpl: Debug + DynEq + DynHash + Send + Sync { } /// If this function is ordered-set aggregate function, return true - /// If the function is not, return false + /// otherwise, return false + /// + /// Ordered-set aggregate functions require an explicit `ORDER BY` clause + /// because the calculation performed by these functions is dependent on the + /// specific sequence of the input rows, unlike other aggregate functions + /// like `SUM`, `AVG`, or `COUNT`. 
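To make the ordering dependence concrete, here is a small standalone percentile_cont over f64 values (illustrative only, not DataFusion's implementation); the result is defined on the sorted input, which is exactly what WITHIN GROUP (ORDER BY ...) pins down:

// Continuous percentile over a sorted copy of the input, with linear interpolation.
fn percentile_cont(mut values: Vec<f64>, fraction: f64) -> Option<f64> {
    if values.is_empty() || !(0.0..=1.0).contains(&fraction) {
        return None;
    }
    values.sort_by(|a, b| a.total_cmp(b));
    let rank = fraction * (values.len() - 1) as f64;
    let (lo, hi) = (rank.floor() as usize, rank.ceil() as usize);
    Some(values[lo] + (values[hi] - values[lo]) * (rank - lo as f64))
}

fn main() {
    // The answer depends on the sort order of the inputs, not their arrival order.
    assert_eq!(percentile_cont(vec![3.0, 1.0, 2.0, 4.0], 0.5), Some(2.5));
}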
+ /// + /// An example of an ordered-set aggregate function is `percentile_cont` + /// which computes a specific percentile value from a sorted list of values, and + /// is only meaningful when the input data is ordered. + /// + /// In SQL syntax, ordered-set aggregate functions are used with the + /// `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering of the input + /// data. fn is_ordered_set_aggregate(&self) -> bool { false } @@ -792,7 +805,7 @@ pub fn udaf_default_schema_name( // exclude the first function argument(= column) in ordered set aggregate function, // because it is duplicated with the WITHIN GROUP clause in schema name. - let args = if func.is_ordered_set_aggregate() { + let args = if func.is_ordered_set_aggregate() && !order_by.is_empty() { &args[1..] } else { &args[..] @@ -1221,37 +1234,6 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl { } } -// Aggregate UDF doc sections for use in public documentation -pub mod aggregate_doc_sections { - use crate::DocSection; - - pub fn doc_sections() -> Vec { - vec![ - DOC_SECTION_GENERAL, - DOC_SECTION_STATISTICAL, - DOC_SECTION_APPROXIMATE, - ] - } - - pub const DOC_SECTION_GENERAL: DocSection = DocSection { - include: true, - label: "General Functions", - description: None, - }; - - pub const DOC_SECTION_STATISTICAL: DocSection = DocSection { - include: true, - label: "Statistical Functions", - description: None, - }; - - pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection { - include: true, - label: "Approximate Functions", - description: None, - }; -} - /// Indicates whether an aggregation function is monotonic as a set /// function. A set function is monotonically increasing if its value /// increases as its argument grows (as a set). Formally, `f` is a diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 758c6b5f2f..bc9e62fe62 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -870,122 +870,6 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { } } -// Scalar UDF doc sections for use in public documentation -pub mod scalar_doc_sections { - use crate::DocSection; - - pub fn doc_sections() -> Vec { - vec![ - DOC_SECTION_MATH, - DOC_SECTION_CONDITIONAL, - DOC_SECTION_STRING, - DOC_SECTION_BINARY_STRING, - DOC_SECTION_REGEX, - DOC_SECTION_DATETIME, - DOC_SECTION_ARRAY, - DOC_SECTION_STRUCT, - DOC_SECTION_MAP, - DOC_SECTION_HASHING, - DOC_SECTION_UNION, - DOC_SECTION_OTHER, - ] - } - - pub const fn doc_sections_const() -> &'static [DocSection] { - &[ - DOC_SECTION_MATH, - DOC_SECTION_CONDITIONAL, - DOC_SECTION_STRING, - DOC_SECTION_BINARY_STRING, - DOC_SECTION_REGEX, - DOC_SECTION_DATETIME, - DOC_SECTION_ARRAY, - DOC_SECTION_STRUCT, - DOC_SECTION_MAP, - DOC_SECTION_HASHING, - DOC_SECTION_UNION, - DOC_SECTION_OTHER, - ] - } - - pub const DOC_SECTION_MATH: DocSection = DocSection { - include: true, - label: "Math Functions", - description: None, - }; - - pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection { - include: true, - label: "Conditional Functions", - description: None, - }; - - pub const DOC_SECTION_STRING: DocSection = DocSection { - include: true, - label: "String Functions", - description: None, - }; - - pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection { - include: true, - label: "Binary String Functions", - description: None, - }; - - pub const DOC_SECTION_REGEX: DocSection = DocSection { - include: true, - label: "Regular Expression Functions", - description: Some( - r#"Apache DataFusion uses a 
[PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) -regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) -(minus support for several features including look-around and backreferences). -The following regular expression functions are supported:"#, - ), - }; - - pub const DOC_SECTION_DATETIME: DocSection = DocSection { - include: true, - label: "Time and Date Functions", - description: None, - }; - - pub const DOC_SECTION_ARRAY: DocSection = DocSection { - include: true, - label: "Array Functions", - description: None, - }; - - pub const DOC_SECTION_STRUCT: DocSection = DocSection { - include: true, - label: "Struct Functions", - description: None, - }; - - pub const DOC_SECTION_MAP: DocSection = DocSection { - include: true, - label: "Map Functions", - description: None, - }; - - pub const DOC_SECTION_HASHING: DocSection = DocSection { - include: true, - label: "Hashing Functions", - description: None, - }; - - pub const DOC_SECTION_OTHER: DocSection = DocSection { - include: true, - label: "Other Functions", - description: None, - }; - - pub const DOC_SECTION_UNION: DocSection = DocSection { - include: true, - label: "Union Functions", - description: Some("Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. Note: Not related to the SQL UNION operator"), - }; -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index a7b4302dd8..402add0391 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -525,37 +525,6 @@ impl WindowUDFImpl for AliasedWindowUDFImpl { } } -// Window UDF doc sections for use in public documentation -pub mod window_doc_sections { - use datafusion_doc::DocSection; - - pub fn doc_sections() -> Vec { - vec![ - DOC_SECTION_AGGREGATE, - DOC_SECTION_RANKING, - DOC_SECTION_ANALYTICAL, - ] - } - - pub const DOC_SECTION_AGGREGATE: DocSection = DocSection { - include: true, - label: "Aggregate Functions", - description: Some("All aggregate functions can be used as window functions."), - }; - - pub const DOC_SECTION_RANKING: DocSection = DocSection { - include: true, - label: "Ranking Functions", - description: None, - }; - - pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection { - include: true, - label: "Analytical Functions", - description: None, - }; -} - #[cfg(test)] mod test { use crate::{PartitionEvaluator, WindowUDF, WindowUDFImpl}; diff --git a/datafusion/expr/src/window_state.rs b/datafusion/expr/src/window_state.rs index 014bed5aea..12eda9a745 100644 --- a/datafusion/expr/src/window_state.rs +++ b/datafusion/expr/src/window_state.rs @@ -90,7 +90,12 @@ impl WindowAggState { partition_batch_state: &PartitionBatchState, ) -> Result<()> { self.last_calculated_index += out_col.len(); - self.out_col = concat(&[&self.out_col, &out_col])?; + // no need to use concat if the current `out_col` is empty + if self.out_col.is_empty() { + self.out_col = Arc::clone(out_col); + } else { + self.out_col = concat(&[&self.out_col, &out_col])?; + } self.n_row_result_missing = partition_batch_state.record_batch.num_rows() - self.last_calculated_index; self.is_end = partition_batch_state.is_end; @@ -265,6 +270,15 @@ impl PartitionBatchState { } } + pub fn new_with_batch(batch: RecordBatch) -> Self { + Self { + record_batch: batch, + most_recent_row: None, + is_end: false, + n_out_row: 0, + } + } + pub fn extend(&mut self, batch: &RecordBatch) -> Result<()> { self.record_batch = 
concat_batches(&self.record_batch.schema(), [&self.record_batch, batch])?; diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index 48283f4cfd..72070984f9 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -17,10 +17,10 @@ under the License. --> -# `datafusion-ffi`: Apache DataFusion Foreign Function Interface +# Apache DataFusion Foreign Function Interface -This crate contains code to allow interoperability of Apache [DataFusion] with -functions from other libraries and/or [DataFusion] versions using a stable +This crate contains code to allow interoperability of [Apache DataFusion] with +functions from other libraries and/or DataFusion versions using a stable interface. One of the limitations of the Rust programming language is that there is no @@ -28,10 +28,10 @@ stable [Rust ABI] (Application Binary Interface). If a library is compiled with one version of the Rust compiler and you attempt to use that library with a program compiled by a different Rust compiler, there is no guarantee that you can access the data structures. In order to share code between libraries loaded -at runtime, you need to use Rust's [FFI](Foreign Function Interface (FFI)). +at runtime, you need to use Rust's [FFI] (Foreign Function Interface (FFI)). -The purpose of this crate is to define interfaces between [DataFusion] libraries -that will remain stable across different versions of [DataFusion]. This allows +The purpose of this crate is to define interfaces between DataFusion libraries +that will remain stable across different versions of DataFusion. This allows users to write libraries that can interface between each other at runtime rather than require compiling all of the code into a single executable. @@ -46,7 +46,7 @@ See [API Docs] for details and examples. Two use cases have been identified for this crate, but they are not intended to be all inclusive. -1. `datafusion-python` which will use the FFI to provide external services such +1. [`datafusion-python`] which will use the FFI to provide external services such as a `TableProvider` without needing to re-export the entire `datafusion-python` code base. With `datafusion-ffi` these packages do not need `datafusion-python` as a dependency at all. @@ -68,8 +68,8 @@ stable interfaces that closely mirror the Rust native approach. To learn more about this approach see the [abi_stable] and [async-ffi] crates. If you have a library in another language that you wish to interface to -[DataFusion] the recommendation is to create a Rust wrapper crate to interface -with your library and then to connect it to [DataFusion] using this crate. +DataFusion the recommendation is to create a Rust wrapper crate to interface +with your library and then to connect it to DataFusion using this crate. Alternatively, you could use [bindgen] to interface directly to the [FFI] provided by this crate, but that is currently not supported. @@ -101,12 +101,12 @@ In this crate we have a variety of structs which closely mimic the behavior of their internal counterparts. To see detailed notes about how to use them, see the example in `FFI_TableProvider`. 
-[datafusion]: https://datafusion.apache.org +[apache datafusion]: https://datafusion.apache.org/ [api docs]: http://docs.rs/datafusion-ffi/latest [rust abi]: https://doc.rust-lang.org/reference/abi.html [ffi]: https://doc.rust-lang.org/nomicon/ffi.html [abi_stable]: https://crates.io/crates/abi_stable [async-ffi]: https://crates.io/crates/async-ffi [bindgen]: https://crates.io/crates/bindgen -[datafusion-python]: https://datafusion.apache.org/python/ +[`datafusion-python`]: https://datafusion.apache.org/python/ [datafusion-contrib]: https://github.com/datafusion-contrib diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 832e82dda3..48c2698a58 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -181,6 +181,7 @@ impl TryFrom for PlanProperties { // TODO Extend FFI to get the registry and codex let default_ctx = SessionContext::new(); + let task_context = default_ctx.task_ctx(); let codex = DefaultPhysicalExtensionCodec {}; let ffi_orderings = unsafe { (ffi_props.output_ordering)(&ffi_props) }; @@ -190,7 +191,7 @@ impl TryFrom for PlanProperties { .map_err(|e| DataFusionError::External(Box::new(e)))?; let sort_exprs = parse_physical_sort_exprs( &proto_output_ordering.physical_sort_expr_nodes, - &default_ctx, + &task_context, &schema, &codex, )?; @@ -202,7 +203,7 @@ impl TryFrom for PlanProperties { .map_err(|e| DataFusionError::External(Box::new(e)))?; let partitioning = parse_protobuf_partitioning( Some(&proto_output_partitioning), - &default_ctx, + &task_context, &schema, &codex, )? diff --git a/datafusion/ffi/src/udaf/accumulator_args.rs b/datafusion/ffi/src/udaf/accumulator_args.rs index 2cd2fa5f51..594b839458 100644 --- a/datafusion/ffi/src/udaf/accumulator_args.rs +++ b/datafusion/ffi/src/udaf/accumulator_args.rs @@ -116,16 +116,17 @@ impl TryFrom for ForeignAccumulatorArgs { let schema = Schema::try_from(&value.schema.0)?; let default_ctx = SessionContext::new(); + let task_ctx = default_ctx.task_ctx(); let codex = DefaultPhysicalExtensionCodec {}; let order_bys = parse_physical_sort_exprs( &proto_def.ordering_req, - &default_ctx, + &task_ctx, &schema, &codex, )?; - let exprs = parse_physical_exprs(&proto_def.expr, &default_ctx, &schema, &codex)?; + let exprs = parse_physical_exprs(&proto_def.expr, &task_ctx, &schema, &codex)?; Ok(Self { return_field, diff --git a/datafusion/ffi/src/udwf/partition_evaluator.rs b/datafusion/ffi/src/udwf/partition_evaluator.rs index 995d00cce3..14cf23b919 100644 --- a/datafusion/ffi/src/udwf/partition_evaluator.rs +++ b/datafusion/ffi/src/udwf/partition_evaluator.rs @@ -86,7 +86,7 @@ pub struct PartitionEvaluatorPrivateData { } impl FFI_PartitionEvaluator { - unsafe fn inner_mut(&mut self) -> &mut Box<(dyn PartitionEvaluator + 'static)> { + unsafe fn inner_mut(&mut self) -> &mut Box { let private_data = self.private_data as *mut PartitionEvaluatorPrivateData; &mut (*private_data).evaluator } diff --git a/datafusion/ffi/src/udwf/partition_evaluator_args.rs b/datafusion/ffi/src/udwf/partition_evaluator_args.rs index dffeb23741..b6f9d2a13e 100644 --- a/datafusion/ffi/src/udwf/partition_evaluator_args.rs +++ b/datafusion/ffi/src/udwf/partition_evaluator_args.rs @@ -148,7 +148,7 @@ impl TryFrom for ForeignPartitionEvaluatorArgs { .map_err(|e| DataFusionError::Execution(e.to_string()))? 
.iter() .map(|expr_node| { - parse_physical_expr(expr_node, &default_ctx, &schema, &codec) + parse_physical_expr(expr_node, &default_ctx.task_ctx(), &schema, &codec) }) .collect::>>()?; diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml index cf065ca1cb..a6e0a1fc2f 100644 --- a/datafusion/functions-aggregate-common/Cargo.toml +++ b/datafusion/functions-aggregate-common/Cargo.toml @@ -19,6 +19,7 @@ name = "datafusion-functions-aggregate-common" description = "Utility functions for implementing aggregate functions for the DataFusion query engine" keywords = ["datafusion", "logical", "plan", "expressions"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } diff --git a/datafusion/functions-aggregate-common/README.md b/datafusion/functions-aggregate-common/README.md index 61a81e8085..3d52aa7220 100644 --- a/datafusion/functions-aggregate-common/README.md +++ b/datafusion/functions-aggregate-common/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Aggregate Function Library +# Apache DataFusion Aggregate Function Common Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains common functionality for implementation aggregate and window functions. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs index 3d6889431d..56cdaf6618 100644 --- a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs +++ b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +mod decimal; mod numeric; +pub use decimal::DecimalDistinctAvgAccumulator; pub use numeric::Float64DistinctAvgAccumulator; diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs new file mode 100644 index 0000000000..9920bf5bf4 --- /dev/null +++ b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::{ + array::{ArrayRef, ArrowNumericType}, + datatypes::{ + i256, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, DecimalType, + }, +}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr_common::accumulator::Accumulator; +use std::fmt::Debug; +use std::mem::size_of_val; + +use crate::aggregate::sum_distinct::DistinctSumAccumulator; +use crate::utils::DecimalAverager; + +/// Generic implementation of `AVG DISTINCT` for Decimal types. +/// Handles both all Arrow decimal types (32, 64, 128 and 256 bits). +#[derive(Debug)] +pub struct DecimalDistinctAvgAccumulator { + sum_accumulator: DistinctSumAccumulator, + sum_scale: i8, + target_precision: u8, + target_scale: i8, +} + +impl DecimalDistinctAvgAccumulator { + pub fn with_decimal_params( + sum_scale: i8, + target_precision: u8, + target_scale: i8, + ) -> Self { + let data_type = T::TYPE_CONSTRUCTOR(T::MAX_PRECISION, sum_scale); + + Self { + sum_accumulator: DistinctSumAccumulator::new(&data_type), + sum_scale, + target_precision, + target_scale, + } + } +} + +impl Accumulator + for DecimalDistinctAvgAccumulator +{ + fn state(&mut self) -> Result> { + self.sum_accumulator.state() + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + self.sum_accumulator.update_batch(values) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + self.sum_accumulator.merge_batch(states) + } + + fn evaluate(&mut self) -> Result { + if self.sum_accumulator.distinct_count() == 0 { + return ScalarValue::new_primitive::( + None, + &T::TYPE_CONSTRUCTOR(self.target_precision, self.target_scale), + ); + } + + let sum_scalar = self.sum_accumulator.evaluate()?; + + match sum_scalar { + ScalarValue::Decimal32(Some(sum), _, _) => { + let decimal_averager = DecimalAverager::::try_new( + self.sum_scale, + self.target_precision, + self.target_scale, + )?; + let avg = decimal_averager + .avg(sum, self.sum_accumulator.distinct_count() as i32)?; + Ok(ScalarValue::Decimal32( + Some(avg), + self.target_precision, + self.target_scale, + )) + } + ScalarValue::Decimal64(Some(sum), _, _) => { + let decimal_averager = DecimalAverager::::try_new( + self.sum_scale, + self.target_precision, + self.target_scale, + )?; + let avg = decimal_averager + .avg(sum, self.sum_accumulator.distinct_count() as i64)?; + Ok(ScalarValue::Decimal64( + Some(avg), + self.target_precision, + self.target_scale, + )) + } + ScalarValue::Decimal128(Some(sum), _, _) => { + let decimal_averager = DecimalAverager::::try_new( + self.sum_scale, + self.target_precision, + self.target_scale, + )?; + let avg = decimal_averager + .avg(sum, self.sum_accumulator.distinct_count() as i128)?; + Ok(ScalarValue::Decimal128( + Some(avg), + self.target_precision, + self.target_scale, + )) + } + ScalarValue::Decimal256(Some(sum), _, _) => { + let decimal_averager = DecimalAverager::::try_new( + self.sum_scale, + self.target_precision, + self.target_scale, + )?; + // `distinct_count` returns `u64`, but `avg` expects `i256` + // first convert `u64` to `i128`, then convert `i128` to `i256` to avoid overflow + let distinct_cnt: i128 = self.sum_accumulator.distinct_count() as i128; + let count: i256 = i256::from_i128(distinct_cnt); + let avg = decimal_averager.avg(sum, count)?; + Ok(ScalarValue::Decimal256( + Some(avg), + self.target_precision, + self.target_scale, + )) + } + + _ => unreachable!("Unsupported decimal type: {:?}", sum_scalar), + } + } + 
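+    // Note on `evaluate` above: the distinct sum is kept at the input scale
+    // (with the type's maximum precision, see `with_decimal_params`), and
+    // `DecimalAverager` rescales `sum / distinct_count` to
+    // (`target_precision`, `target_scale`) so the result matches the planned
+    // return type of `AVG(DISTINCT)`.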
+ fn size(&self) -> usize { + let fixed_size = size_of_val(self); + + // Account for the size of the sum_accumulator with its contained values + fixed_size + self.sum_accumulator.size() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ + Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + }; + use std::sync::Arc; + + #[test] + fn test_decimal32_distinct_avg_accumulator() -> Result<()> { + let precision = 5_u8; + let scale = 2_i8; + let array = Decimal32Array::from(vec![ + Some(10_00), + Some(12_50), + Some(17_50), + Some(20_00), + Some(20_00), + Some(30_00), + None, + None, + ]) + .with_precision_and_scale(precision, scale)?; + + let mut accumulator = + DecimalDistinctAvgAccumulator::::with_decimal_params( + scale, 9, 6, + ); + accumulator.update_batch(&[Arc::new(array)])?; + + let result = accumulator.evaluate()?; + let expected_result = ScalarValue::Decimal32(Some(18000000), 9, 6); + assert_eq!(result, expected_result); + + Ok(()) + } + + #[test] + fn test_decimal64_distinct_avg_accumulator() -> Result<()> { + let precision = 10_u8; + let scale = 4_i8; + let array = Decimal64Array::from(vec![ + Some(100_0000), + Some(125_0000), + Some(175_0000), + Some(200_0000), + Some(200_0000), + Some(300_0000), + None, + None, + ]) + .with_precision_and_scale(precision, scale)?; + + let mut accumulator = + DecimalDistinctAvgAccumulator::::with_decimal_params( + scale, 14, 8, + ); + accumulator.update_batch(&[Arc::new(array)])?; + + let result = accumulator.evaluate()?; + let expected_result = ScalarValue::Decimal64(Some(180_00000000), 14, 8); + assert_eq!(result, expected_result); + + Ok(()) + } + + #[test] + fn test_decimal128_distinct_avg_accumulator() -> Result<()> { + let precision = 10_u8; + let scale = 4_i8; + let array = Decimal128Array::from(vec![ + Some(100_0000), + Some(125_0000), + Some(175_0000), + Some(200_0000), + Some(200_0000), + Some(300_0000), + None, + None, + ]) + .with_precision_and_scale(precision, scale)?; + + let mut accumulator = + DecimalDistinctAvgAccumulator::::with_decimal_params( + scale, 14, 8, + ); + accumulator.update_batch(&[Arc::new(array)])?; + + let result = accumulator.evaluate()?; + let expected_result = ScalarValue::Decimal128(Some(180_00000000), 14, 8); + assert_eq!(result, expected_result); + + Ok(()) + } + + #[test] + fn test_decimal256_distinct_avg_accumulator() -> Result<()> { + let precision = 50_u8; + let scale = 2_i8; + + let array = Decimal256Array::from(vec![ + Some(i256::from_i128(10_000)), + Some(i256::from_i128(12_500)), + Some(i256::from_i128(17_500)), + Some(i256::from_i128(20_000)), + Some(i256::from_i128(20_000)), + Some(i256::from_i128(30_000)), + None, + None, + ]) + .with_precision_and_scale(precision, scale)?; + + let mut accumulator = + DecimalDistinctAvgAccumulator::::with_decimal_params( + scale, 54, 6, + ); + accumulator.update_batch(&[Arc::new(array)])?; + + let result = accumulator.evaluate()?; + let expected_result = + ScalarValue::Decimal256(Some(i256::from_i128(180_000000)), 54, 6); + assert_eq!(result, expected_result); + + Ok(()) + } +} diff --git a/datafusion/functions-aggregate-common/src/min_max.rs b/datafusion/functions-aggregate-common/src/min_max.rs index 806071dd2f..7dd60e1c0e 100644 --- a/datafusion/functions-aggregate-common/src/min_max.rs +++ b/datafusion/functions-aggregate-common/src/min_max.rs @@ -19,15 +19,15 @@ use arrow::array::{ ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Decimal256Array, 
DurationMicrosecondArray, - DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, - FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, - IntervalYearMonthArray, LargeBinaryArray, LargeStringArray, StringArray, - StringViewArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, - Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, + DurationSecondArray, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, + IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, + LargeStringArray, StringArray, StringViewArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::compute; use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; @@ -144,6 +144,32 @@ macro_rules! min_max { ($VALUE:expr, $DELTA:expr, $OP:ident) => {{ Ok(match ($VALUE, $DELTA) { (ScalarValue::Null, ScalarValue::Null) => ScalarValue::Null, + ( + lhs @ ScalarValue::Decimal32(lhsv, lhsp, lhss), + rhs @ ScalarValue::Decimal32(rhsv, rhsp, rhss) + ) => { + if lhsp.eq(rhsp) && lhss.eq(rhss) { + typed_min_max!(lhsv, rhsv, Decimal32, $OP, lhsp, lhss) + } else { + return internal_err!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?}", + (lhs, rhs) + ); + } + } + ( + lhs @ ScalarValue::Decimal64(lhsv, lhsp, lhss), + rhs @ ScalarValue::Decimal64(rhsv, rhsp, rhss) + ) => { + if lhsp.eq(rhsp) && lhss.eq(rhss) { + typed_min_max!(lhsv, rhsv, Decimal64, $OP, lhsp, lhss) + } else { + return internal_err!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?}", + (lhs, rhs) + ); + } + } ( lhs @ ScalarValue::Decimal128(lhsv, lhsp, lhss), rhs @ ScalarValue::Decimal128(rhsv, rhsp, rhss) @@ -513,6 +539,26 @@ macro_rules! min_max_batch { ($VALUES:expr, $OP:ident) => {{ match $VALUES.data_type() { DataType::Null => ScalarValue::Null, + DataType::Decimal32(precision, scale) => { + typed_min_max_batch!( + $VALUES, + Decimal32Array, + Decimal32, + $OP, + precision, + scale + ) + } + DataType::Decimal64(precision, scale) => { + typed_min_max_batch!( + $VALUES, + Decimal64Array, + Decimal64, + $OP, + precision, + scale + ) + } DataType::Decimal128(precision, scale) => { typed_min_max_batch!( $VALUES, @@ -659,7 +705,7 @@ macro_rules! min_max_batch { other => { // This should have been handled before return datafusion_common::internal_err!( - "Min/Max accumulator not implemented for type {:?}", + "Min/Max accumulator not implemented for type {}", other ); } diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs index 38a9292cea..370a640b04 100644 --- a/datafusion/functions-aggregate-common/src/tdigest.rs +++ b/datafusion/functions-aggregate-common/src/tdigest.rs @@ -45,7 +45,7 @@ macro_rules! 
cast_scalar_f64 { ($value:expr ) => { match &$value { ScalarValue::Float64(Some(v)) => *v, - v => panic!("invalid type {:?}", v), + v => panic!("invalid type {}", v), } }; } @@ -56,7 +56,7 @@ macro_rules! cast_scalar_u64 { ($value:expr ) => { match &$value { ScalarValue::UInt64(Some(v)) => *v, - v => panic!("invalid type {:?}", v), + v => panic!("invalid type {}", v), } }; } diff --git a/datafusion/functions-aggregate/README.md b/datafusion/functions-aggregate/README.md index 244112d4fd..aa50eaeeda 100644 --- a/datafusion/functions-aggregate/README.md +++ b/datafusion/functions-aggregate/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Aggregate Function Library +# Apache DataFusion Aggregate Function Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains implementations of aggregate functions. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 74aa1bf68c..abb144c045 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -23,8 +23,11 @@ use arrow::array::{ GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; use arrow::datatypes::{ - ArrowPrimitiveType, FieldRef, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + ArrowPrimitiveType, Date32Type, Date64Type, FieldRef, Int16Type, Int32Type, + Int64Type, Int8Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, + Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, + UInt8Type, }; use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::ScalarValue; @@ -169,6 +172,9 @@ where } } +#[derive(Debug)] +struct NullHLLAccumulator; + macro_rules! 
default_accumulator_impl { () => { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { @@ -264,6 +270,29 @@ where default_accumulator_impl!(); } +impl Accumulator for NullHLLAccumulator { + fn update_batch(&mut self, _values: &[ArrayRef]) -> Result<()> { + // do nothing, all values are null + Ok(()) + } + + fn merge_batch(&mut self, _states: &[ArrayRef]) -> Result<()> { + Ok(()) + } + + fn state(&mut self) -> Result> { + Ok(vec![]) + } + + fn evaluate(&mut self) -> Result { + Ok(ScalarValue::UInt64(Some(0))) + } + + fn size(&self) -> usize { + size_of_val(self) + } +} + impl Debug for ApproxDistinct { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("ApproxDistinct") @@ -347,11 +376,38 @@ impl AggregateUDFImpl for ApproxDistinct { DataType::Int16 => Box::new(NumericHLLAccumulator::::new()), DataType::Int32 => Box::new(NumericHLLAccumulator::::new()), DataType::Int64 => Box::new(NumericHLLAccumulator::::new()), + DataType::Date32 => Box::new(NumericHLLAccumulator::::new()), + DataType::Date64 => Box::new(NumericHLLAccumulator::::new()), + DataType::Time32(TimeUnit::Second) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Time32(TimeUnit::Millisecond) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Time64(TimeUnit::Microsecond) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Time64(TimeUnit::Nanosecond) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Timestamp(TimeUnit::Second, _) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + Box::new(NumericHLLAccumulator::::new()) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + Box::new(NumericHLLAccumulator::::new()) + } DataType::Utf8 => Box::new(StringHLLAccumulator::::new()), DataType::LargeUtf8 => Box::new(StringHLLAccumulator::::new()), DataType::Utf8View => Box::new(StringViewHLLAccumulator::::new()), DataType::Binary => Box::new(BinaryHLLAccumulator::::new()), DataType::LargeBinary => Box::new(BinaryHLLAccumulator::::new()), + DataType::Null => Box::new(NullHLLAccumulator), other => { return not_impl_err!( "Support for 'approx_distinct' for data type {other} is not implemented" diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 640e0e5bac..0deb09184b 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -384,19 +384,23 @@ impl ApproxPercentileAccumulator { } } - // public for approx_percentile_cont_with_weight + // pub(crate) for approx_percentile_cont_with_weight pub(crate) fn max_size(&self) -> usize { self.digest.max_size() } - // public for approx_percentile_cont_with_weight - pub fn merge_digests(&mut self, digests: &[TDigest]) { + // pub(crate) for approx_percentile_cont_with_weight + pub(crate) fn merge_digests(&mut self, digests: &[TDigest]) { let digests = digests.iter().chain(std::iter::once(&self.digest)); self.digest = TDigest::merge_digests(digests) } - // public for approx_percentile_cont_with_weight - pub fn convert_to_float(values: &ArrayRef) -> Result> { + // pub(crate) for approx_percentile_cont_with_weight + pub(crate) fn convert_to_float(values: &ArrayRef) -> Result> { + debug_assert!( + values.null_count() == 0, + "convert_to_float assumes nulls have already been filtered out" + ); 
match values.data_type() { DataType::Float64 => { let array = downcast_value!(values, Float64Array); @@ -493,7 +497,7 @@ impl Accumulator for ApproxPercentileAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { // Remove any nulls before computing the percentile let mut values = Arc::clone(&values[0]); - if values.nulls().is_some() { + if values.null_count() > 0 { values = filter(&values, &is_not_null(&values)?)?; } let sorted_values = &arrow::compute::sort(&values, None)?; @@ -521,7 +525,7 @@ impl Accumulator for ApproxPercentileAccumulator { DataType::UInt64 => ScalarValue::UInt64(Some(q as u64)), DataType::Float32 => ScalarValue::Float32(Some(q as f32)), DataType::Float64 => ScalarValue::Float64(Some(q)), - v => unreachable!("unexpected return type {:?}", v), + v => unreachable!("unexpected return type {}", v), }) } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 637de83fa8..89ff546039 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -21,6 +21,7 @@ use std::hash::Hash; use std::mem::size_of_val; use std::sync::Arc; +use arrow::compute::{and, filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{array::ArrayRef, datatypes::DataType}; use datafusion_common::ScalarValue; @@ -268,15 +269,37 @@ impl Accumulator for ApproxPercentileWithWeightAccumulator { } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - let means = &values[0]; - let weights = &values[1]; + let mut means = Arc::clone(&values[0]); + let mut weights = Arc::clone(&values[1]); + // If nulls are present in either array, need to filter those rows out in both arrays + match (means.null_count() > 0, weights.null_count() > 0) { + // Both have nulls + (true, true) => { + let predicate = and(&is_not_null(&means)?, &is_not_null(&weights)?)?; + means = filter(&means, &predicate)?; + weights = filter(&weights, &predicate)?; + } + // Only one has nulls + (false, true) => { + let predicate = &is_not_null(&weights)?; + means = filter(&means, predicate)?; + weights = filter(&weights, predicate)?; + } + (true, false) => { + let predicate = &is_not_null(&means)?; + means = filter(&means, predicate)?; + weights = filter(&weights, predicate)?; + } + // No nulls + (false, false) => {} + } debug_assert_eq!( means.len(), weights.len(), "invalid number of values in means and weights" ); - let means_f64 = ApproxPercentileAccumulator::convert_to_float(means)?; - let weights_f64 = ApproxPercentileAccumulator::convert_to_float(weights)?; + let means_f64 = ApproxPercentileAccumulator::convert_to_float(&means)?; + let weights_f64 = ApproxPercentileAccumulator::convert_to_float(&weights)?; let mut digests: Vec = vec![]; for (mean, weight) in means_f64.iter().zip(weights_f64.iter()) { digests.push(TDigest::new_with_centroid( diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index f7cb74fd55..d007163e7c 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -24,9 +24,11 @@ use arrow::array::{ use arrow::compute::sum; use arrow::datatypes::{ - i256, ArrowNativeType, DataType, Decimal128Type, Decimal256Type, DecimalType, - DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, - DurationSecondType, Field, FieldRef, Float64Type, TimeUnit, 
UInt64Type, + i256, ArrowNativeType, DataType, Decimal128Type, Decimal256Type, Decimal32Type, + Decimal64Type, DecimalType, DurationMicrosecondType, DurationMillisecondType, + DurationNanosecondType, DurationSecondType, Field, FieldRef, Float64Type, TimeUnit, + UInt64Type, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; use datafusion_common::{ exec_err, not_impl_err, utils::take_function_args, Result, ScalarValue, @@ -36,11 +38,13 @@ use datafusion_expr::type_coercion::aggregates::{avg_return_type, coerce_avg_typ use datafusion_expr::utils::format_state_name; use datafusion_expr::Volatility::Immutable; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator, + Accumulator, AggregateUDFImpl, Documentation, EmitTo, Expr, GroupsAccumulator, ReversedUDAF, Signature, }; -use datafusion_functions_aggregate_common::aggregate::avg_distinct::Float64DistinctAvgAccumulator; +use datafusion_functions_aggregate_common::aggregate::avg_distinct::{ + DecimalDistinctAvgAccumulator, Float64DistinctAvgAccumulator, +}; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::{ filtered_null_mask, set_nulls, @@ -62,6 +66,17 @@ make_udaf_expr_and_func!( avg_udaf ); +pub fn avg_distinct(expr: Expr) -> Expr { + Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new_udf( + avg_udaf(), + vec![expr], + true, + None, + vec![], + None, + )) +} + #[user_doc( doc_section(label = "General Functions"), description = "Returns the average of numeric values in the specified column.", @@ -120,14 +135,75 @@ impl AggregateUDFImpl for Avg { // instantiate specialized accumulator based for the type if acc_args.is_distinct { - match &data_type { + match (&data_type, acc_args.return_type()) { // Numeric types are converted to Float64 via `coerce_avg_type` during logical plan creation - Float64 => Ok(Box::new(Float64DistinctAvgAccumulator::default())), - _ => exec_err!("AVG(DISTINCT) for {} not supported", data_type), + (Float64, _) => Ok(Box::new(Float64DistinctAvgAccumulator::default())), + + ( + Decimal32(_, scale), + Decimal32(target_precision, target_scale), + ) => Ok(Box::new(DecimalDistinctAvgAccumulator::::with_decimal_params( + *scale, + *target_precision, + *target_scale, + ))), + ( + Decimal64(_, scale), + Decimal64(target_precision, target_scale), + ) => Ok(Box::new(DecimalDistinctAvgAccumulator::::with_decimal_params( + *scale, + *target_precision, + *target_scale, + ))), + ( + Decimal128(_, scale), + Decimal128(target_precision, target_scale), + ) => Ok(Box::new(DecimalDistinctAvgAccumulator::::with_decimal_params( + *scale, + *target_precision, + *target_scale, + ))), + + ( + Decimal256(_, scale), + Decimal256(target_precision, target_scale), + ) => Ok(Box::new(DecimalDistinctAvgAccumulator::::with_decimal_params( + *scale, + *target_precision, + *target_scale, + ))), + + (dt, return_type) => exec_err!( + "AVG(DISTINCT) for ({} --> {}) not supported", + dt, + return_type + ), } } else { - match (&data_type, acc_args.return_field.data_type()) { + match (&data_type, acc_args.return_type()) { (Float64, Float64) => Ok(Box::::default()), + ( + Decimal32(sum_precision, sum_scale), + Decimal32(target_precision, target_scale), + ) => Ok(Box::new(DecimalAvgAccumulator:: { + sum: None, + count: 0, + sum_scale: *sum_scale, + sum_precision: *sum_precision, + target_precision: *target_precision, + 
target_scale: *target_scale, + })), + ( + Decimal64(sum_precision, sum_scale), + Decimal64(target_precision, target_scale), + ) => Ok(Box::new(DecimalAvgAccumulator:: { + sum: None, + count: 0, + sum_scale: *sum_scale, + sum_precision: *sum_precision, + target_precision: *target_precision, + target_scale: *target_scale, + })), ( Decimal128(sum_precision, sum_scale), Decimal128(target_precision, target_scale), @@ -161,22 +237,37 @@ impl AggregateUDFImpl for Avg { })) } - _ => exec_err!( - "AvgAccumulator for ({} --> {})", - &data_type, - acc_args.return_field.data_type() - ), + (dt, return_type) => { + exec_err!("AvgAccumulator for ({} --> {})", dt, return_type) + } } } } fn state_fields(&self, args: StateFieldsArgs) -> Result> { if args.is_distinct { - // Copied from datafusion_functions_aggregate::sum::Sum::state_fields + // Decimal accumulator actually uses a different precision during accumulation, + // see DecimalDistinctAvgAccumulator::with_decimal_params + let dt = match args.input_fields[0].data_type() { + DataType::Decimal32(_, scale) => { + DataType::Decimal32(DECIMAL32_MAX_PRECISION, *scale) + } + DataType::Decimal64(_, scale) => { + DataType::Decimal64(DECIMAL64_MAX_PRECISION, *scale) + } + DataType::Decimal128(_, scale) => { + DataType::Decimal128(DECIMAL128_MAX_PRECISION, *scale) + } + DataType::Decimal256(_, scale) => { + DataType::Decimal256(DECIMAL256_MAX_PRECISION, *scale) + } + _ => args.return_type().clone(), + }; + // Similar to datafusion_functions_aggregate::sum::Sum::state_fields // since the accumulator uses DistinctSumAccumulator internally. Ok(vec![Field::new_list( format_state_name(args.name, "avg distinct"), - Field::new_list_field(args.return_type().clone(), true), + Field::new_list_field(dt, true), false, ) .into()]) @@ -202,7 +293,12 @@ impl AggregateUDFImpl for Avg { fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool { matches!( args.return_field.data_type(), - DataType::Float64 | DataType::Decimal128(_, _) | DataType::Duration(_) + DataType::Float64 + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + | DataType::Duration(_) ) && !args.is_distinct } @@ -222,6 +318,44 @@ impl AggregateUDFImpl for Avg { |sum: f64, count: u64| Ok(sum / count as f64), ))) } + ( + Decimal32(_sum_precision, sum_scale), + Decimal32(target_precision, target_scale), + ) => { + let decimal_averager = DecimalAverager::::try_new( + *sum_scale, + *target_precision, + *target_scale, + )?; + + let avg_fn = + move |sum: i32, count: u64| decimal_averager.avg(sum, count as i32); + + Ok(Box::new(AvgGroupsAccumulator::::new( + &data_type, + args.return_field.data_type(), + avg_fn, + ))) + } + ( + Decimal64(_sum_precision, sum_scale), + Decimal64(target_precision, target_scale), + ) => { + let decimal_averager = DecimalAverager::::try_new( + *sum_scale, + *target_precision, + *target_scale, + )?; + + let avg_fn = + move |sum: i64, count: u64| decimal_averager.avg(sum, count as i64); + + Ok(Box::new(AvgGroupsAccumulator::::new( + &data_type, + args.return_field.data_type(), + avg_fn, + ))) + } ( Decimal128(_sum_precision, sum_scale), Decimal128(target_precision, target_scale), @@ -405,7 +539,7 @@ impl Accumulator for DecimalAvgAccumu self.count += (values.len() - values.null_count()) as u64; if let Some(x) = sum(values) { - let v = self.sum.get_or_insert(T::Native::default()); + let v = self.sum.get_or_insert_with(T::Native::default); self.sum = Some(v.add_wrapping(x)); } Ok(()) @@ -450,7 +584,7 @@ impl 
Accumulator for DecimalAvgAccumu // sums are summed if let Some(x) = sum(states[1].as_primitive::()) { - let v = self.sum.get_or_insert(T::Native::default()); + let v = self.sum.get_or_insert_with(T::Native::default); self.sum = Some(v.add_wrapping(x)); } Ok(()) @@ -605,7 +739,7 @@ where { pub fn new(sum_data_type: &DataType, return_data_type: &DataType, avg_fn: F) -> Self { debug!( - "AvgGroupsAccumulator ({}, sum type: {sum_data_type:?}) --> {return_data_type:?}", + "AvgGroupsAccumulator ({}, sum type: {sum_data_type}) --> {return_data_type}", std::any::type_name::() ); diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index 7f0fd8e514..e63044c753 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -40,7 +40,7 @@ use datafusion_expr::{ Signature, Volatility, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; use std::sync::LazyLock; @@ -382,7 +382,7 @@ where { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { if let Some(x) = arrow::compute::bit_or(values[0].as_primitive::()) { - let v = self.value.get_or_insert(T::Native::usize_as(0)); + let v = self.value.get_or_insert_with(|| T::Native::usize_as(0)); *v = *v | x; } Ok(()) @@ -427,7 +427,7 @@ where { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { if let Some(x) = arrow::compute::bit_xor(values[0].as_primitive::()) { - let v = self.value.get_or_insert(T::Native::usize_as(0)); + let v = self.value.get_or_insert_with(|| T::Native::usize_as(0)); *v = *v ^ x; } Ok(()) diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs index 47fdb7504d..20f23662ca 100644 --- a/datafusion/functions-aggregate/src/correlation.rs +++ b/datafusion/functions-aggregate/src/correlation.rs @@ -194,6 +194,11 @@ impl Accumulator for CorrelationAccumulator { } fn evaluate(&mut self) -> Result { + let n = self.covar.get_count(); + if n < 2 { + return Ok(ScalarValue::Float64(None)); + } + let covar = self.covar.evaluate()?; let stddev1 = self.stddev1.evaluate()?; let stddev2 = self.stddev2.evaluate()?; @@ -202,7 +207,7 @@ impl Accumulator for CorrelationAccumulator { if let ScalarValue::Float64(Some(s1)) = stddev1 { if let ScalarValue::Float64(Some(s2)) = stddev2 { if s1 == 0_f64 || s2 == 0_f64 { - return Ok(ScalarValue::Float64(Some(0_f64))); + return Ok(ScalarValue::Float64(None)); } else { return Ok(ScalarValue::Float64(Some(c / s1 / s2))); } @@ -459,11 +464,8 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator { // the `denominator` state is 0. In these cases, the final aggregation // result should be `Null` (according to PostgreSQL's behavior). // - // TODO: Old datafusion implementation returns 0.0 for these invalid cases. - // Update this to match PostgreSQL's behavior. 
for i in 0..n { if self.count[i] < 2 { - // TODO: Evaluate as `Null` (see notes above) values.push(0.0); nulls.append_null(); continue; @@ -484,7 +486,6 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator { ((sum_xx - sum_x * mean_x) * (sum_yy - sum_y * mean_y)).sqrt(); if denominator == 0.0 { - // TODO: Evaluate as `Null` (see notes above) values.push(0.0); nulls.append_null(); } else { diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 6ef1332ba0..28755427c7 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -30,12 +30,12 @@ use arrow::array::{ use arrow::buffer::{BooleanBuffer, NullBuffer}; use arrow::compute::{self, LexicographicalComparator, SortColumn, SortOptions}; use arrow::datatypes::{ - DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Field, FieldRef, - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, + Decimal64Type, Field, FieldRef, Float16Type, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, Time32MillisecondType, Time32SecondType, + Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; use datafusion_common::cast::as_boolean_array; use datafusion_common::utils::{compare_rows, extract_row_at_idx_to_buf, get_row_at_idx}; @@ -185,6 +185,8 @@ impl AggregateUDFImpl for FirstValue { | Float16 | Float32 | Float64 + | Decimal32(_, _) + | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _) | Date32 @@ -234,6 +236,8 @@ impl AggregateUDFImpl for FirstValue { DataType::Float32 => create_accumulator::(args), DataType::Float64 => create_accumulator::(args), + DataType::Decimal32(_, _) => create_accumulator::(args), + DataType::Decimal64(_, _) => create_accumulator::(args), DataType::Decimal128(_, _) => create_accumulator::(args), DataType::Decimal256(_, _) => create_accumulator::(args), @@ -1124,6 +1128,8 @@ impl AggregateUDFImpl for LastValue { | Float16 | Float32 | Float64 + | Decimal32(_, _) + | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _) | Date32 @@ -1175,6 +1181,8 @@ impl AggregateUDFImpl for LastValue { DataType::Float32 => create_accumulator::(args), DataType::Float64 => create_accumulator::(args), + DataType::Decimal32(_, _) => create_accumulator::(args), + DataType::Decimal64(_, _) => create_accumulator::(args), DataType::Decimal128(_, _) => create_accumulator::(args), DataType::Decimal256(_, _) => create_accumulator::(args), diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index b5bb69f6da..8236d456fd 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -105,6 +105,7 @@ pub mod expr_fn { pub use super::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight; pub use super::array_agg::array_agg; pub use super::average::avg; + pub use super::average::avg_distinct; pub use super::bit_and_or_xor::bit_and; pub use super::bit_and_or_xor::bit_or; pub use 
super::bit_and_or_xor::bit_xor; @@ -134,6 +135,7 @@ pub mod expr_fn { pub use super::stddev::stddev; pub use super::stddev::stddev_pop; pub use super::sum::sum; + pub use super::sum::sum_distinct; pub use super::variance::var_pop; pub use super::variance::var_sample; } diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index a73ccbd99b..a65759594e 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -35,7 +35,9 @@ use arrow::{ use arrow::array::Array; use arrow::array::ArrowNativeTypeOp; -use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, FieldRef}; +use arrow::datatypes::{ + ArrowNativeType, ArrowPrimitiveType, Decimal32Type, Decimal64Type, FieldRef, +}; use datafusion_common::{ internal_datafusion_err, internal_err, DataFusionError, HashSet, Result, ScalarValue, @@ -166,6 +168,8 @@ impl AggregateUDFImpl for Median { DataType::Float16 => helper!(Float16Type, dt), DataType::Float32 => helper!(Float32Type, dt), DataType::Float64 => helper!(Float64Type, dt), + DataType::Decimal32(_, _) => helper!(Decimal32Type, dt), + DataType::Decimal64(_, _) => helper!(Decimal64Type, dt), DataType::Decimal128(_, _) => helper!(Decimal128Type, dt), DataType::Decimal256(_, _) => helper!(Decimal256Type, dt), _ => Err(DataFusionError::NotImplemented(format!( @@ -205,6 +209,8 @@ impl AggregateUDFImpl for Median { DataType::Float16 => helper!(Float16Type, dt), DataType::Float32 => helper!(Float32Type, dt), DataType::Float64 => helper!(Float64Type, dt), + DataType::Decimal32(_, _) => helper!(Decimal32Type, dt), + DataType::Decimal64(_, _) => helper!(Decimal64Type, dt), DataType::Decimal128(_, _) => helper!(Decimal128Type, dt), DataType::Decimal256(_, _) => helper!(Decimal256Type, dt), _ => Err(DataFusionError::NotImplemented(format!( diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 639c08706b..1a46afefff 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -23,10 +23,10 @@ mod min_max_struct; use arrow::array::ArrayRef; use arrow::datatypes::{ - DataType, Decimal128Type, Decimal256Type, DurationMicrosecondType, - DurationMillisecondType, DurationNanosecondType, DurationSecondType, Float16Type, - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + DataType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, + DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, + DurationSecondType, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use datafusion_common::stats::Precision; use datafusion_common::{exec_err, internal_err, ColumnStatistics, Result}; @@ -239,6 +239,8 @@ impl AggregateUDFImpl for Max { | Float16 | Float32 | Float64 + | Decimal32(_, _) + | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _) | Date32 @@ -320,6 +322,12 @@ impl AggregateUDFImpl for Max { Duration(Nanosecond) => { primitive_max_accumulator!(data_type, i64, DurationNanosecondType) } + Decimal32(_, _) => { + primitive_max_accumulator!(data_type, i32, Decimal32Type) + } + Decimal64(_, _) => { + primitive_max_accumulator!(data_type, i64, Decimal64Type) + } Decimal128(_, _) => { primitive_max_accumulator!(data_type, i128, Decimal128Type) } @@ -518,6 +526,8 @@ impl AggregateUDFImpl for Min { | Float16 | Float32 | Float64 + 
| Decimal32(_, _) + | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _) | Date32 @@ -599,6 +609,12 @@ impl AggregateUDFImpl for Min { Duration(Nanosecond) => { primitive_min_accumulator!(data_type, i64, DurationNanosecondType) } + Decimal32(_, _) => { + primitive_min_accumulator!(data_type, i32, Decimal32Type) + } + Decimal64(_, _) => { + primitive_min_accumulator!(data_type, i64, Decimal64Type) + } Decimal128(_, _) => { primitive_min_accumulator!(data_type, i128, Decimal128Type) } diff --git a/datafusion/functions-aggregate/src/regr.rs b/datafusion/functions-aggregate/src/regr.rs index 77bb04bf2c..44ce0bd48e 100644 --- a/datafusion/functions-aggregate/src/regr.rs +++ b/datafusion/functions-aggregate/src/regr.rs @@ -29,7 +29,7 @@ use datafusion_common::{ downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, HashMap, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; +use datafusion_doc::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index 445c7dfe6b..958553d78c 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -18,7 +18,10 @@ //! Defines `SUM` and `SUM DISTINCT` aggregate accumulators use ahash::RandomState; +use arrow::datatypes::DECIMAL32_MAX_PRECISION; +use arrow::datatypes::DECIMAL64_MAX_PRECISION; use datafusion_expr::utils::AggregateOrderSensitivity; +use datafusion_expr::Expr; use std::any::Any; use std::mem::size_of_val; @@ -27,8 +30,8 @@ use arrow::array::ArrowNativeTypeOp; use arrow::array::{ArrowNumericType, AsArray}; use arrow::datatypes::{ArrowNativeType, FieldRef}; use arrow::datatypes::{ - DataType, Decimal128Type, Decimal256Type, Float64Type, Int64Type, UInt64Type, - DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + DataType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float64Type, + Int64Type, UInt64Type, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, }; use arrow::{array::ArrayRef, datatypes::Field}; use datafusion_common::{ @@ -53,6 +56,17 @@ make_udaf_expr_and_func!( sum_udaf ); +pub fn sum_distinct(expr: Expr) -> Expr { + Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new_udf( + sum_udaf(), + vec![expr], + true, + None, + vec![], + None, + )) +} + /// Sum only supports a subset of numeric types, instead relying on type coercion /// /// This macro is similar to [downcast_primitive](arrow::array::downcast_primitive) @@ -71,6 +85,12 @@ macro_rules! 
downcast_sum { DataType::Float64 => { $helper!(Float64Type, $args.return_field.data_type().clone()) } + DataType::Decimal32(_, _) => { + $helper!(Decimal32Type, $args.return_field.data_type().clone()) + } + DataType::Decimal64(_, _) => { + $helper!(Decimal64Type, $args.return_field.data_type().clone()) + } DataType::Decimal128(_, _) => { $helper!(Decimal128Type, $args.return_field.data_type().clone()) } @@ -145,13 +165,14 @@ impl AggregateUDFImpl for Sum { DataType::Dictionary(_, v) => coerced_type(v), // in the spark, the result type is DECIMAL(min(38,precision+10), s) // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - Ok(data_type.clone()) - } + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) => Ok(data_type.clone()), dt if dt.is_signed_integer() => Ok(DataType::Int64), dt if dt.is_unsigned_integer() => Ok(DataType::UInt64), dt if dt.is_floating() => Ok(DataType::Float64), - _ => exec_err!("Sum not supported for {}", data_type), + _ => exec_err!("Sum not supported for {data_type}"), } } @@ -163,6 +184,18 @@ impl AggregateUDFImpl for Sum { DataType::Int64 => Ok(DataType::Int64), DataType::UInt64 => Ok(DataType::UInt64), DataType::Float64 => Ok(DataType::Float64), + DataType::Decimal32(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal32(new_precision, *scale)) + } + DataType::Decimal64(precision, scale) => { + // in the spark, the result type is DECIMAL(min(38,precision+10), s) + // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 + let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10); + Ok(DataType::Decimal64(new_precision, *scale)) + } DataType::Decimal128(precision, scale) => { // in the spark, the result type is DECIMAL(min(38,precision+10), s) // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66 @@ -314,7 +347,7 @@ impl Accumulator for SumAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = values[0].as_primitive::(); if let Some(x) = arrow::compute::sum(values) { - let v = self.sum.get_or_insert(T::Native::usize_as(0)); + let v = self.sum.get_or_insert_with(|| T::Native::usize_as(0)); *v = v.add_wrapping(x); } Ok(()) diff --git a/datafusion/functions-nested/README.md b/datafusion/functions-nested/README.md index 0fa93619b9..6ab456edb1 100644 --- a/datafusion/functions-nested/README.md +++ b/datafusion/functions-nested/README.md @@ -17,16 +17,18 @@ under the License. --> -# DataFusion Nested Type Function Library +# Apache DataFusion Nested Type Function Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. 
This crate contains functions for working with arrays, maps and structs, such as `array_append` that work with -`ListArray`, `LargeListArray` and `FixedListArray` types from the `arrow` crate. +`ListArray`, `LargeListArray` and `FixedListArray` types from the [`arrow`] crate. Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[`arrow`]: https://crates.io/crates/arrow [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index af9000fd75..f77cc5dd7b 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -25,10 +25,10 @@ use datafusion_common::cast::{as_fixed_size_list_array, as_generic_list_array}; use datafusion_common::utils::string_utils::string_array_to_vec; use datafusion_common::utils::take_function_args; use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; -use datafusion_expr::expr::{InList, ScalarFunction}; +use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::ExprSimplifyResult; use datafusion_expr::{ - ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, + in_list, ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; use datafusion_physical_expr_common::datum::compare_with_eq; @@ -131,40 +131,42 @@ impl ScalarUDFImpl for ArrayHas { // if the haystack is a constant list, we can use an inlist expression which is more // efficient because the haystack is not varying per-row - if let Expr::Literal(ScalarValue::List(array), _) = haystack { - // TODO: support LargeList - // (not supported by `convert_array_to_scalar_vec`) - // (FixedSizeList not supported either, but seems to have worked fine when attempting to - // build a reproducer) - - assert_eq!(array.len(), 1); // guarantee of ScalarValue - if let Ok(scalar_values) = - ScalarValue::convert_array_to_scalar_vec(array.as_ref()) - { - assert_eq!(scalar_values.len(), 1); - let list = scalar_values - .into_iter() - .flatten() - .map(|v| Expr::Literal(v, None)) - .collect(); - - return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList { - expr: Box::new(std::mem::take(needle)), - list, - negated: false, - }))); + match haystack { + Expr::Literal( + // FixedSizeList gets coerced to List + scalar @ ScalarValue::List(_) | scalar @ ScalarValue::LargeList(_), + _, + ) => { + let array = scalar.to_array().unwrap(); // guarantee of ScalarValue + if let Ok(scalar_values) = + ScalarValue::convert_array_to_scalar_vec(&array) + { + assert_eq!(scalar_values.len(), 1); + let list = scalar_values + .into_iter() + .flatten() + .map(|v| Expr::Literal(v, None)) + .collect(); + + return Ok(ExprSimplifyResult::Simplified(in_list( + std::mem::take(needle), + list, + false, + ))); + } } - } else if let Expr::ScalarFunction(ScalarFunction { func, args }) = haystack { - // make_array has a static set of arguments, so we can pull the arguments out from it - if func == &make_array_udf() { - return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList { - expr: Box::new(std::mem::take(needle)), - list: std::mem::take(args), - negated: false, - }))); + Expr::ScalarFunction(ScalarFunction { 
func, args }) + if func == &make_array_udf() => + { + // make_array has a static set of arguments, so we can pull the arguments out from it + return Ok(ExprSimplifyResult::Simplified(in_list( + std::mem::take(needle), + std::mem::take(args), + false, + ))); } - } - + _ => {} + }; Ok(ExprSimplifyResult::Original(args)) } @@ -497,7 +499,7 @@ impl Default for ArrayHasAll { impl ArrayHasAll { pub fn new() -> Self { Self { - signature: Signature::any(2, Volatility::Immutable), + signature: Signature::arrays(2, None, Volatility::Immutable), aliases: vec![String::from("list_has_all")], } } @@ -571,7 +573,7 @@ impl Default for ArrayHasAny { impl ArrayHasAny { pub fn new() -> Self { Self { - signature: Signature::any(2, Volatility::Immutable), + signature: Signature::arrays(2, None, Volatility::Immutable), aliases: vec![String::from("list_has_any"), String::from("arrays_overlap")], } } diff --git a/datafusion/functions-nested/src/cardinality.rs b/datafusion/functions-nested/src/cardinality.rs index 1f0f508c6d..6db0011cd0 100644 --- a/datafusion/functions-nested/src/cardinality.rs +++ b/datafusion/functions-nested/src/cardinality.rs @@ -58,7 +58,6 @@ impl Cardinality { ], Volatility::Immutable, ), - aliases: vec![], } } } @@ -83,7 +82,6 @@ impl Cardinality { #[derive(Debug, PartialEq, Eq, Hash)] pub struct Cardinality { signature: Signature, - aliases: Vec, } impl Default for Cardinality { @@ -114,10 +112,6 @@ impl ScalarUDFImpl for Cardinality { make_scalar_function(cardinality_inner)(&args.args) } - fn aliases(&self) -> &[String] { - &self.aliases - } - fn documentation(&self) -> Option<&Documentation> { self.doc() } diff --git a/datafusion/functions-nested/src/concat.rs b/datafusion/functions-nested/src/concat.rs index 43a00fefcb..9a12db525f 100644 --- a/datafusion/functions-nested/src/concat.rs +++ b/datafusion/functions-nested/src/concat.rs @@ -319,8 +319,9 @@ impl ScalarUDFImpl for ArrayConcat { } } else { plan_err!( - "Failed to unify argument types of {}: {arg_types:?}", - self.name() + "Failed to unify argument types of {}: [{}]", + self.name(), + arg_types.iter().join(", ") ) } } diff --git a/datafusion/functions-nested/src/distance.rs b/datafusion/functions-nested/src/distance.rs index 1ccd22cdf5..e2e38fbd0d 100644 --- a/datafusion/functions-nested/src/distance.rs +++ b/datafusion/functions-nested/src/distance.rs @@ -30,13 +30,11 @@ use datafusion_common::cast::{ as_int64_array, }; use datafusion_common::utils::{coerced_type_with_base_type_only, ListCoercion}; -use datafusion_common::{ - exec_err, internal_datafusion_err, plan_err, utils::take_function_args, Result, -}; +use datafusion_common::{exec_err, plan_err, utils::take_function_args, Result}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; -use datafusion_functions::{downcast_arg, downcast_named_arg}; +use datafusion_functions::downcast_arg; use datafusion_macros::user_doc; use itertools::Itertools; use std::any::Any; diff --git a/datafusion/functions-nested/src/except.rs b/datafusion/functions-nested/src/except.rs index a7ce36ae33..d6982ab5a2 100644 --- a/datafusion/functions-nested/src/except.rs +++ b/datafusion/functions-nested/src/except.rs @@ -22,7 +22,7 @@ use arrow::array::{cast::AsArray, Array, ArrayRef, GenericListArray, OffsetSizeT use arrow::buffer::OffsetBuffer; use arrow::datatypes::{DataType, FieldRef}; use arrow::row::{RowConverter, SortField}; -use datafusion_common::utils::take_function_args; +use datafusion_common::utils::{take_function_args, ListCoercion}; use 
datafusion_common::{internal_err, HashSet, Result}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, @@ -81,7 +81,11 @@ impl Default for ArrayExcept { impl ArrayExcept { pub fn new() -> Self { Self { - signature: Signature::any(2, Volatility::Immutable), + signature: Signature::arrays( + 2, + Some(ListCoercion::FixedSizedListToList), + Volatility::Immutable, + ), aliases: vec!["list_except".to_string()], } } diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index 7fb38ace69..7aad167f1d 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -337,7 +337,7 @@ impl ArraySlice { ArrayFunctionArgument::Index, ArrayFunctionArgument::Index, ], - array_coercion: None, + array_coercion: Some(ListCoercion::FixedSizedListToList), }), TypeSignature::ArraySignature(ArrayFunctionSignature::Array { arguments: vec![ @@ -346,7 +346,7 @@ impl ArraySlice { ArrayFunctionArgument::Index, ArrayFunctionArgument::Index, ], - array_coercion: None, + array_coercion: Some(ListCoercion::FixedSizedListToList), }), ], Volatility::Immutable, @@ -451,7 +451,7 @@ fn array_slice_inner(args: &[ArrayRef]) -> Result { let array = as_large_list_array(&args[0])?; general_array_slice::(array, from_array, to_array, stride) } - _ => exec_err!("array_slice does not support type: {:?}", array_data_type), + _ => exec_err!("array_slice does not support type: {}", array_data_type), } } @@ -672,15 +672,7 @@ pub(super) struct ArrayPopFront { impl ArrayPopFront { pub fn new() -> Self { Self { - signature: Signature { - type_signature: TypeSignature::ArraySignature( - ArrayFunctionSignature::Array { - arguments: vec![ArrayFunctionArgument::Array], - array_coercion: Some(ListCoercion::FixedSizedListToList), - }, - ), - volatility: Volatility::Immutable, - }, + signature: Signature::array(Volatility::Immutable), aliases: vec![String::from("list_pop_front")], } } @@ -730,10 +722,7 @@ fn array_pop_front_inner(args: &[ArrayRef]) -> Result { let array = as_large_list_array(&args[0])?; general_pop_front_list::(array) } - _ => exec_err!( - "array_pop_front does not support type: {:?}", - array_data_type - ), + _ => exec_err!("array_pop_front does not support type: {}", array_data_type), } } @@ -779,15 +768,7 @@ pub(super) struct ArrayPopBack { impl ArrayPopBack { pub fn new() -> Self { Self { - signature: Signature { - type_signature: TypeSignature::ArraySignature( - ArrayFunctionSignature::Array { - arguments: vec![ArrayFunctionArgument::Array], - array_coercion: Some(ListCoercion::FixedSizedListToList), - }, - ), - volatility: Volatility::Immutable, - }, + signature: Signature::array(Volatility::Immutable), aliases: vec![String::from("list_pop_back")], } } @@ -839,7 +820,7 @@ fn array_pop_back_inner(args: &[ArrayRef]) -> Result { general_pop_back_list::(array) } _ => exec_err!( - "array_pop_back does not support type: {:?}", + "array_pop_back does not support type: {}", array.data_type() ), } @@ -942,7 +923,7 @@ fn array_any_value_inner(args: &[ArrayRef]) -> Result { let array = as_large_list_array(&array)?; general_array_any_value::(array) } - data_type => exec_err!("array_any_value does not support type: {:?}", data_type), + data_type => exec_err!("array_any_value does not support type: {data_type}"), } } diff --git a/datafusion/functions-nested/src/flatten.rs b/datafusion/functions-nested/src/flatten.rs index 413b28aa80..1b74af643c 100644 --- a/datafusion/functions-nested/src/flatten.rs +++ 
b/datafusion/functions-nested/src/flatten.rs @@ -25,11 +25,9 @@ use arrow::datatypes::{ DataType::{FixedSizeList, LargeList, List, Null}, }; use datafusion_common::cast::{as_large_list_array, as_list_array}; -use datafusion_common::utils::ListCoercion; use datafusion_common::{exec_err, utils::take_function_args, Result}; use datafusion_expr::{ - ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation, - ScalarUDFImpl, Signature, TypeSignature, Volatility, + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -75,15 +73,7 @@ impl Default for Flatten { impl Flatten { pub fn new() -> Self { Self { - signature: Signature { - type_signature: TypeSignature::ArraySignature( - ArrayFunctionSignature::Array { - arguments: vec![ArrayFunctionArgument::Array], - array_coercion: Some(ListCoercion::FixedSizedListToList), - }, - ), - volatility: Volatility::Immutable, - }, + signature: Signature::array(Volatility::Immutable), aliases: vec![], } } @@ -104,7 +94,7 @@ impl ScalarUDFImpl for Flatten { fn return_type(&self, arg_types: &[DataType]) -> Result { let data_type = match &arg_types[0] { - List(field) | FixedSizeList(field, _) => match field.data_type() { + List(field) => match field.data_type() { List(field) | FixedSizeList(field, _) => List(Arc::clone(field)), _ => arg_types[0].clone(), }, diff --git a/datafusion/functions-nested/src/length.rs b/datafusion/functions-nested/src/length.rs index d829fac27b..060a978185 100644 --- a/datafusion/functions-nested/src/length.rs +++ b/datafusion/functions-nested/src/length.rs @@ -29,11 +29,12 @@ use arrow::datatypes::{ use datafusion_common::cast::{ as_fixed_size_list_array, as_generic_list_array, as_int64_array, }; -use datafusion_common::{exec_err, internal_datafusion_err, plan_err, Result}; +use datafusion_common::{exec_err, Result}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation, + ScalarUDFImpl, Signature, TypeSignature, Volatility, }; -use datafusion_functions::{downcast_arg, downcast_named_arg}; +use datafusion_functions::downcast_arg; use datafusion_macros::user_doc; use std::any::Any; use std::sync::Arc; @@ -79,7 +80,22 @@ impl Default for ArrayLength { impl ArrayLength { pub fn new() -> Self { Self { - signature: Signature::variadic_any(Volatility::Immutable), + signature: Signature::one_of( + vec![ + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array], + array_coercion: None, + }), + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Index, + ], + array_coercion: None, + }), + ], + Volatility::Immutable, + ), aliases: vec![String::from("list_length")], } } @@ -97,13 +113,8 @@ impl ScalarUDFImpl for ArrayLength { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(match arg_types[0] { - List(_) | LargeList(_) | FixedSizeList(_, _) => UInt64, - _ => { - return plan_err!("The array_length function can only accept List/LargeList/FixedSizeList."); - } - }) + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(UInt64) } fn invoke_with_args( @@ -149,7 +160,7 @@ pub fn array_length_inner(args: &[ArrayRef]) -> Result { List(_) => general_array_length::(args), LargeList(_) => general_array_length::(args), FixedSizeList(_, _) => fixed_size_array_length(args), 
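Several error messages in the hunks above, and in the `array_length` arm immediately below, switch from `{:?}` to `{}`, that is, from `Debug` to `Display` formatting of the offending `DataType` (the new format strings rely on `DataType` providing a `Display` impl). A self-contained sketch of the difference, using a stand-in type:

```rust
// Stand-in type showing why `{}` (Display) reads better than `{:?}` (Debug)
// in user-facing error messages.
use std::fmt;

struct TypeName(&'static str);

impl fmt::Display for TypeName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0) // human-readable, e.g. List(Int64)
    }
}

impl fmt::Debug for TypeName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "TypeName({:?})", self.0) // developer-oriented, with extra structure
    }
}

fn main() {
    let t = TypeName("List(Int64)");
    println!("does not support type '{t}'");   // does not support type 'List(Int64)'
    println!("does not support type '{t:?}'"); // does not support type 'TypeName("List(Int64)")'
}
```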
- array_type => exec_err!("array_length does not support type '{array_type:?}'"), + array_type => exec_err!("array_length does not support type '{array_type}'"), } } diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index e4dd9b3662..97d64c70cd 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -39,6 +39,7 @@ use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use datafusion_macros::user_doc; +use itertools::Itertools as _; make_udf_expr_and_func!( MakeArray, @@ -132,8 +133,9 @@ impl ScalarUDFImpl for MakeArray { Ok(vec![unified; arg_types.len()]) } else { plan_err!( - "Failed to unify argument types of {}: {arg_types:?}", - self.name() + "Failed to unify argument types of {}: [{}]", + self.name(), + arg_types.iter().join(", ") ) } } diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs index b7a9b878c6..f4fa8630a8 100644 --- a/datafusion/functions-nested/src/planner.rs +++ b/datafusion/functions-nested/src/planner.rs @@ -108,7 +108,7 @@ impl ExprPlanner for NestedFunctionPlanner { } fn plan_make_map(&self, args: Vec) -> Result>> { - if args.len() % 2 != 0 { + if !args.len().is_multiple_of(2) { return plan_err!("make_map requires an even number of arguments"); } diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs index be7ce05b69..dae946def8 100644 --- a/datafusion/functions-nested/src/position.rs +++ b/datafusion/functions-nested/src/position.rs @@ -147,7 +147,7 @@ pub fn array_position_inner(args: &[ArrayRef]) -> Result { match &args[0].data_type() { List(_) => general_position_dispatch::(args), LargeList(_) => general_position_dispatch::(args), - array_type => exec_err!("array_position does not support type '{array_type:?}'."), + array_type => exec_err!("array_position does not support type '{array_type}'."), } } fn general_position_dispatch(args: &[ArrayRef]) -> Result { @@ -308,7 +308,7 @@ pub fn array_positions_inner(args: &[ArrayRef]) -> Result { general_positions::(arr, element) } array_type => { - exec_err!("array_positions does not support type '{array_type:?}'.") + exec_err!("array_positions does not support type '{array_type}'.") } } } diff --git a/datafusion/functions-nested/src/remove.rs b/datafusion/functions-nested/src/remove.rs index e500d71967..d330606cdd 100644 --- a/datafusion/functions-nested/src/remove.rs +++ b/datafusion/functions-nested/src/remove.rs @@ -26,9 +26,11 @@ use arrow::array::{ use arrow::buffer::OffsetBuffer; use arrow::datatypes::{DataType, Field}; use datafusion_common::cast::as_int64_array; +use datafusion_common::utils::ListCoercion; use datafusion_common::{exec_err, utils::take_function_args, Result}; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation, + ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use datafusion_macros::user_doc; use std::any::Any; @@ -156,7 +158,17 @@ pub(super) struct ArrayRemoveN { impl ArrayRemoveN { pub fn new() -> Self { Self { - signature: Signature::any(3, Volatility::Immutable), + signature: Signature::new( + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Element, + ArrayFunctionArgument::Index, + ], + array_coercion: 
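The `plan_make_map` hunk above swaps the hand-written parity check `args.len() % 2 != 0` for `usize::is_multiple_of`, and the same rewrite appears again later for `named_struct`. A minimal sketch of the equivalence, assuming a toolchain where `is_multiple_of` is stable for unsigned integers:

```rust
// The two forms agree for every unsigned length; the method form states the
// intent ("an even number of arguments is required") more directly than the
// negated modulo test.
fn main() {
    for args_len in 0usize..=6 {
        assert_eq!(args_len % 2 == 0, args_len.is_multiple_of(2));
    }
    assert!(!5usize.is_multiple_of(2)); // an odd argument count would be rejected
}
```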
Some(ListCoercion::FixedSizedListToList), + }), + Volatility::Immutable, + ), aliases: vec!["list_remove_n".to_string()], } } @@ -311,7 +323,7 @@ fn array_remove_internal( general_remove::(list_array, element_array, arr_n) } array_type => { - exec_err!("array_remove_all does not support type '{array_type:?}'.") + exec_err!("array_remove_all does not support type '{array_type}'.") } } } diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs index d791c6d1f1..59f851a776 100644 --- a/datafusion/functions-nested/src/replace.rs +++ b/datafusion/functions-nested/src/replace.rs @@ -430,7 +430,7 @@ pub(crate) fn array_replace_inner(args: &[ArrayRef]) -> Result { general_replace::(list_array, from, to, arr_n) } DataType::Null => Ok(new_null_array(array.data_type(), 1)), - array_type => exec_err!("array_replace does not support type '{array_type:?}'."), + array_type => exec_err!("array_replace does not support type '{array_type}'."), } } @@ -450,7 +450,7 @@ pub(crate) fn array_replace_n_inner(args: &[ArrayRef]) -> Result { } DataType::Null => Ok(new_null_array(array.data_type(), 1)), array_type => { - exec_err!("array_replace_n does not support type '{array_type:?}'.") + exec_err!("array_replace_n does not support type '{array_type}'.") } } } @@ -471,7 +471,7 @@ pub(crate) fn array_replace_all_inner(args: &[ArrayRef]) -> Result { } DataType::Null => Ok(new_null_array(array.data_type(), 1)), array_type => { - exec_err!("array_replace_all does not support type '{array_type:?}'.") + exec_err!("array_replace_all does not support type '{array_type}'.") } } } diff --git a/datafusion/functions-nested/src/resize.rs b/datafusion/functions-nested/src/resize.rs index db27d649a4..09f67a75fd 100644 --- a/datafusion/functions-nested/src/resize.rs +++ b/datafusion/functions-nested/src/resize.rs @@ -26,7 +26,7 @@ use arrow::buffer::OffsetBuffer; use arrow::datatypes::DataType; use arrow::datatypes::{ArrowNativeType, Field}; use arrow::datatypes::{ - DataType::{FixedSizeList, LargeList, List}, + DataType::{LargeList, List}, FieldRef, }; use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array}; @@ -125,7 +125,7 @@ impl ScalarUDFImpl for ArrayResize { fn return_type(&self, arg_types: &[DataType]) -> Result { match &arg_types[0] { - List(field) | FixedSizeList(field, _) => Ok(List(Arc::clone(field))), + List(field) => Ok(List(Arc::clone(field))), LargeList(field) => Ok(LargeList(Arc::clone(field))), DataType::Null => { Ok(List(Arc::new(Field::new_list_field(DataType::Int64, true)))) @@ -191,7 +191,7 @@ pub(crate) fn array_resize_inner(arg: &[ArrayRef]) -> Result { let array = as_large_list_array(&arg[0])?; general_list_resize::(array, new_len, field, new_element) } - array_type => exec_err!("array_resize does not support type '{array_type:?}'."), + array_type => exec_err!("array_resize does not support type '{array_type}'."), } } diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs index 5b134fe5b2..8440d890d2 100644 --- a/datafusion/functions-nested/src/reverse.rs +++ b/datafusion/functions-nested/src/reverse.rs @@ -76,7 +76,7 @@ impl Default for ArrayReverse { impl ArrayReverse { pub fn new() -> Self { Self { - signature: Signature::any(1, Volatility::Immutable), + signature: Signature::array(Volatility::Immutable), aliases: vec!["list_reverse".to_string()], } } @@ -133,7 +133,7 @@ pub fn array_reverse_inner(arg: &[ArrayRef]) -> Result { fixed_size_array_reverse(array, field) } Null => 
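A recurring change in these hunks (`array_except`, `array_remove_n`, `array_reverse`, and the `array_has_all`/`array_has_any` variants earlier) replaces `Signature::any(..)` with an array-aware signature, optionally asking the coercion machinery to normalize `FixedSizeList` inputs to `List` before the function body runs. A sketch mirroring the `array_except` form above; the constructor and types are used exactly as they appear in this diff:

```rust
// Illustrative only: an `array_except`-style signature declaring two array
// arguments, with FixedSizeList inputs coerced to List up front.
use datafusion_common::utils::ListCoercion;
use datafusion_expr::{Signature, Volatility};

fn array_except_like_signature() -> Signature {
    Signature::arrays(
        2,                                        // number of array arguments
        Some(ListCoercion::FixedSizedListToList), // normalize FixedSizeList to List
        Volatility::Immutable,
    )
}

fn main() {
    let _sig = array_except_like_signature(); // normally stored on the UDF impl
}
```

Declaring the argument shape in the signature is presumably what lets several `return_type` and `invoke` paths in this diff drop their manual type checks.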
Ok(Arc::clone(input_array)), - array_type => exec_err!("array_reverse does not support type '{array_type:?}'."), + array_type => exec_err!("array_reverse does not support type '{array_type}'."), } } @@ -199,7 +199,7 @@ fn fixed_size_array_reverse( // skip the null value if array.is_null(row_index) { nulls.push(false); - mutable.extend(0, 0, 1); + mutable.extend(0, 0, value_length); continue; } else { nulls.push(true); diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs index 3284ee58dd..4a7aa31c75 100644 --- a/datafusion/functions-nested/src/sort.rs +++ b/datafusion/functions-nested/src/sort.rs @@ -18,12 +18,14 @@ //! [`ScalarUDFImpl`] definitions for array_sort function. use crate::utils::make_scalar_function; -use arrow::array::{new_null_array, Array, ArrayRef, ListArray, NullBufferBuilder}; +use arrow::array::{ + new_null_array, Array, ArrayRef, GenericListArray, NullBufferBuilder, OffsetSizeTrait, +}; use arrow::buffer::OffsetBuffer; use arrow::compute::SortColumn; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, FieldRef}; use arrow::{compute, compute::SortOptions}; -use datafusion_common::cast::{as_list_array, as_string_array}; +use datafusion_common::cast::{as_large_list_array, as_list_array, as_string_array}; use datafusion_common::utils::ListCoercion; use datafusion_common::{exec_err, plan_err, Result}; use datafusion_expr::{ @@ -137,6 +139,9 @@ impl ScalarUDFImpl for ArraySort { DataType::List(field) => { Ok(DataType::new_list(field.data_type().clone(), true)) } + DataType::LargeList(field) => { + Ok(DataType::new_large_list(field.data_type().clone(), true)) + } arg_type => { plan_err!("{} does not support type {arg_type}", self.name()) } @@ -165,13 +170,7 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { return exec_err!("array_sort expects one to three arguments"); } - if args[0].data_type().is_null() { - return Ok(Arc::clone(&args[0])); - } - - let list_array = as_list_array(&args[0])?; - let row_count = list_array.len(); - if row_count == 0 || list_array.value_type().is_null() { + if args[0].is_empty() || args[0].data_type().is_null() { return Ok(Arc::clone(&args[0])); } @@ -179,7 +178,7 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { return Ok(new_null_array(args[0].data_type(), args[0].len())); } - let sort_option = match args.len() { + let sort_options = match args.len() { 1 => None, 2 => { let sort = as_string_array(&args[1])?.value(0); @@ -196,9 +195,37 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { nulls_first: order_nulls_first(nulls_first)?, }) } - _ => return exec_err!("array_sort expects 1 to 3 arguments"), + // We guard at the top + _ => unreachable!(), }; + match args[0].data_type() { + DataType::List(field) | DataType::LargeList(field) + if field.data_type().is_null() => + { + Ok(Arc::clone(&args[0])) + } + DataType::List(field) => { + let array = as_list_array(&args[0])?; + array_sort_generic(array, field, sort_options) + } + DataType::LargeList(field) => { + let array = as_large_list_array(&args[0])?; + array_sort_generic(array, field, sort_options) + } + // Signature should prevent this arm ever occurring + _ => exec_err!("array_sort expects list for first argument"), + } +} + +/// Array_sort SQL function +pub fn array_sort_generic( + list_array: &GenericListArray, + field: &FieldRef, + sort_options: Option, +) -> Result { + let row_count = list_array.len(); + let mut array_lengths = vec![]; let mut arrays = vec![]; let mut valid = 
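The `array_sort` rewrite in this hunk (continuing below) replaces the `ListArray`-only body with an `array_sort_generic` helper parameterized over the offset type, so one implementation serves both `List` and `LargeList` inputs. A standalone sketch of that pattern using `arrow` directly; the helper name here is made up:

```rust
// One generic body for ListArray (i32 offsets) and LargeListArray (i64 offsets).
use arrow::array::{GenericListArray, ListArray, OffsetSizeTrait};
use arrow::datatypes::Int32Type;

fn row_count<O: OffsetSizeTrait>(list: &GenericListArray<O>) -> usize {
    list.len()
}

fn main() {
    let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        Some(vec![Some(3), Some(1), Some(2)]),
        None,
    ]);
    assert_eq!(row_count(&list), 2);
}
```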
NullBufferBuilder::new(row_count); @@ -216,14 +243,14 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { DataType::Struct(_) => { let sort_columns: Vec = vec![SortColumn { values: Arc::clone(&arr_ref), - options: sort_option, + options: sort_options, }]; let indices = compute::lexsort_to_indices(&sort_columns, None)?; compute::take(arr_ref.as_ref(), &indices, None)? } _ => { let arr_ref = arr_ref.as_ref(); - compute::sort(arr_ref, sort_option)? + compute::sort(arr_ref, sort_options)? } }; array_lengths.push(sorted_array.len()); @@ -232,8 +259,6 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { } } - // Assume all arrays have the same data type - let data_type = list_array.value_type(); let buffer = valid.finish(); let elements = arrays @@ -242,10 +267,10 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result { .collect::>(); let list_arr = if elements.is_empty() { - ListArray::new_null(Arc::new(Field::new_list_field(data_type, true)), row_count) + GenericListArray::::new_null(Arc::clone(field), row_count) } else { - ListArray::new( - Arc::new(Field::new_list_field(data_type, true)), + GenericListArray::::new( + Arc::clone(field), OffsetBuffer::from_lengths(array_lengths), Arc::new(compute::concat(elements.as_slice())?), buffer, diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index f7bcdb547f..3373f7a983 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -25,9 +25,8 @@ use arrow::array::{ }; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{ - internal_datafusion_err, not_impl_err, plan_err, DataFusionError, Result, -}; +use datafusion_common::utils::ListCoercion; +use datafusion_common::{not_impl_err, DataFusionError, Result}; use std::any::Any; @@ -41,14 +40,17 @@ use arrow::compute::cast; use arrow::datatypes::DataType::{ Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View, }; -use datafusion_common::cast::{as_large_list_array, as_list_array}; +use datafusion_common::cast::{ + as_fixed_size_list_array, as_large_list_array, as_list_array, +}; use datafusion_common::exec_err; use datafusion_common::types::logical_string; use datafusion_expr::{ - Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, - TypeSignatureClass, Volatility, + ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue, + Documentation, ScalarUDFImpl, Signature, TypeSignature, TypeSignatureClass, + Volatility, }; -use datafusion_functions::{downcast_arg, downcast_named_arg}; +use datafusion_functions::downcast_arg; use datafusion_macros::user_doc; use std::sync::Arc; @@ -161,7 +163,26 @@ impl Default for ArrayToString { impl ArrayToString { pub fn new() -> Self { Self { - signature: Signature::variadic_any(Volatility::Immutable), + signature: Signature::one_of( + vec![ + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::String, + ArrayFunctionArgument::String, + ], + array_coercion: Some(ListCoercion::FixedSizedListToList), + }), + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::String, + ], + array_coercion: Some(ListCoercion::FixedSizedListToList), + }), + ], + Volatility::Immutable, + ), aliases: vec![ String::from("list_to_string"), String::from("array_join"), @@ -184,13 +205,8 @@ impl ScalarUDFImpl for ArrayToString { &self.signature } - fn 
return_type(&self, arg_types: &[DataType]) -> Result { - Ok(match arg_types[0] { - List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8, - _ => { - return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList."); - } - }) + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Utf8) } fn invoke_with_args( @@ -284,16 +300,10 @@ impl ScalarUDFImpl for StringToArray { } fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(match arg_types[0] { - Utf8 | Utf8View | LargeUtf8 => { - List(Arc::new(Field::new_list_field(arg_types[0].clone(), true))) - } - _ => { - return plan_err!( - "The string_to_array function can only accept Utf8, Utf8View or LargeUtf8." - ); - } - }) + Ok(List(Arc::new(Field::new_list_field( + arg_types[0].clone(), + true, + )))) } fn invoke_with_args( @@ -370,6 +380,20 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } + FixedSizeList(..) => { + let list_array = as_fixed_size_list_array(&arr)?; + for i in 0..list_array.len() { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } + + Ok(arg) + } LargeList(..) => { let list_array = as_large_list_array(&arr)?; for i in 0..list_array.len() { @@ -451,9 +475,8 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(StringArray::from(res)) } - let arr_type = arr.data_type(); - let string_arr = match arr_type { - List(_) | FixedSizeList(_, _) => { + let string_arr = match arr.data_type() { + List(_) => { let list_array = as_list_array(&arr)?; generate_string_array::( list_array, @@ -471,29 +494,8 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { with_null_string, )? } - _ => { - let mut arg = String::from(""); - let mut res: Vec> = Vec::new(); - // delimiter length is 1 - assert_eq!(delimiters.len(), 1); - let delimiter = delimiters[0].unwrap(); - let s = compute_array_to_string( - &mut arg, - Arc::clone(arr), - delimiter.to_string(), - null_string, - with_null_string, - )? 
- .clone(); - - if !s.is_empty() { - let s = s.strip_suffix(delimiter).unwrap().to_string(); - res.push(Some(s)); - } else { - res.push(Some(s)); - } - StringArray::from(res) - } + // Signature guards against this arm + _ => return exec_err!("array_to_string expects list as first argument"), }; Ok(Arc::new(string_arr)) diff --git a/datafusion/functions-nested/src/utils.rs b/datafusion/functions-nested/src/utils.rs index ed08a82358..464301b6ff 100644 --- a/datafusion/functions-nested/src/utils.rs +++ b/datafusion/functions-nested/src/utils.rs @@ -31,6 +31,7 @@ use datafusion_common::cast::{ use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; +use itertools::Itertools as _; pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> { let data_type = args[0].data_type(); @@ -39,7 +40,10 @@ pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> { || arg.data_type().equals_datatype(&DataType::Null) }) { let types = args.iter().map(|arg| arg.data_type()).collect::>(); - return plan_err!("{name} received incompatible types: '{types:?}'."); + return plan_err!( + "{name} received incompatible types: {}", + types.iter().join(", ") + ); } Ok(()) @@ -260,7 +264,7 @@ pub(crate) fn get_map_entry_field(data_type: &DataType) -> Result<&Fields> { } } } - _ => internal_err!("Expected a Map type, got {:?}", data_type), + _ => internal_err!("Expected a Map type, got {data_type}"), } } diff --git a/datafusion/functions-table/README.md b/datafusion/functions-table/README.md index 485abe560d..89f589a958 100644 --- a/datafusion/functions-table/README.md +++ b/datafusion/functions-table/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Table Function Library +# Apache DataFusion Table Function Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains table functions that can be used in DataFusion queries. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-window-common/README.md b/datafusion/functions-window-common/README.md index 9f64c9dc82..f2e4588072 100644 --- a/datafusion/functions-window-common/README.md +++ b/datafusion/functions-window-common/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Window Function Common Library +# Apache DataFusion Window Function Common Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains common functions for implementing window functions. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
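As the README text above notes, these function sub-crates are re-exported by the top-level [`datafusion`] crate, and a default `SessionContext` already has the built-in functions registered. A hedged sketch of consuming them that way; the async runtime, crate features, and the exact SQL are illustrative:

```rust
// Illustrative only: exercising a built-in window function (`rank`) through the
// top-level `datafusion` crate rather than the individual function sub-crates.
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    let df = ctx
        .sql("SELECT v, rank() OVER (ORDER BY v) AS r FROM (VALUES (1), (2), (2)) AS t(v)")
        .await?;
    df.show().await?;
    Ok(())
}
```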
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-window/README.md b/datafusion/functions-window/README.md index 746d625b4f..f2bb9f53f5 100644 --- a/datafusion/functions-window/README.md +++ b/datafusion/functions-window/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Window Function Library +# Apache DataFusion Window Function Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains window function definitions. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-window/src/lead_lag.rs b/datafusion/functions-window/src/lead_lag.rs index 7950cc93f8..c4cbc268cd 100644 --- a/datafusion/functions-window/src/lead_lag.rs +++ b/datafusion/functions-window/src/lead_lag.rs @@ -23,7 +23,7 @@ use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue}; -use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; +use datafusion_doc::window_doc_sections::DOC_SECTION_ANALYTICAL; use datafusion_expr::{ Documentation, Literal, PartitionEvaluator, ReversedUDWF, Signature, TypeSignature, Volatility, WindowUDFImpl, diff --git a/datafusion/functions-window/src/nth_value.rs b/datafusion/functions-window/src/nth_value.rs index 309978e9e7..02beeec4ec 100644 --- a/datafusion/functions-window/src/nth_value.rs +++ b/datafusion/functions-window/src/nth_value.rs @@ -23,7 +23,7 @@ use arrow::datatypes::FieldRef; use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::datatypes::{DataType, Field}; use datafusion_common::{exec_datafusion_err, exec_err, Result, ScalarValue}; -use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; +use datafusion_doc::window_doc_sections::DOC_SECTION_ANALYTICAL; use datafusion_expr::window_state::WindowAggState; use datafusion_expr::{ Documentation, Literal, PartitionEvaluator, ReversedUDWF, Signature, TypeSignature, diff --git a/datafusion/functions-window/src/rank.rs b/datafusion/functions-window/src/rank.rs index bc88572a92..51ec4bbdbf 100644 --- a/datafusion/functions-window/src/rank.rs +++ b/datafusion/functions-window/src/rank.rs @@ -27,7 +27,7 @@ use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::utils::get_row_at_idx; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; +use datafusion_doc::window_doc_sections::DOC_SECTION_RANKING; use datafusion_expr::{ Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, }; diff --git a/datafusion/functions/README.md b/datafusion/functions/README.md index 
27dc4afc76..dee1330422 100644 --- a/datafusion/functions/README.md +++ b/datafusion/functions/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Function Library +# Apache DataFusion Function Library -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains packages of function that can be used to customize the functionality of DataFusion. @@ -28,5 +28,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index cd5d987006..c18241f799 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -267,11 +267,11 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - regexp_replace::( + regexp_replace::( data.as_string::(), regex.as_string::(), replacement.as_string::(), - Some(&flags), + Some(flags.as_string::()), ) .expect("regexp_replace should work on valid values"), ) @@ -282,19 +282,18 @@ fn criterion_benchmark(c: &mut Criterion) { let mut rng = rand::rng(); let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap(); let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap(); - // flags are not allowed to be utf8view according to the function - let flags = Arc::new(flags(&mut rng)) as ArrayRef; + let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap(); let replacement = Arc::new(StringViewArray::from_iter_values(iter::repeat_n( "XX", 1000, ))); b.iter(|| { black_box( - regexp_replace::( + regexp_replace::( data.as_string_view(), regex.as_string_view(), - &replacement, - Some(&flags), + &*replacement, + Some(flags.as_string_view()), ) .expect("regexp_replace should work on valid values"), ) diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 81dd3c7797..d18bd6e31f 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -256,7 +256,7 @@ impl ScalarUDFImpl for GetFieldFunc { (DataType::Map(_, _), other) => { let data_type = other.data_type(); if data_type.is_nested() { - exec_err!("unsupported type {:?} for map access", data_type) + exec_err!("unsupported type {} for map access", data_type) } else { process_map_array(array, other.to_array()?) } @@ -275,7 +275,7 @@ impl ScalarUDFImpl for GetFieldFunc { (DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)), (dt, name) => exec_err!( "get_field is only possible on maps with utf8 indexes or struct \ - with utf8 indexes. Received {dt:?} with {name:?} index" + with utf8 indexes. 
Received {dt} with {name:?} index" ), } } diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs index c04074b2a8..1da5148474 100644 --- a/datafusion/functions/src/core/named_struct.rs +++ b/datafusion/functions/src/core/named_struct.rs @@ -104,7 +104,7 @@ impl ScalarUDFImpl for NamedStructFunc { ); } - if args.scalar_arguments.len() % 2 != 0 { + if !args.scalar_arguments.len().is_multiple_of(2) { return exec_err!( "named_struct requires an even number of arguments, got {} instead", args.scalar_arguments.len() diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 0f55bddcc9..82aa8d2a4c 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -113,7 +113,7 @@ impl ScalarUDFImpl for NVL2Func { if let Some(coerced_type) = coerced_type { Ok(coerced_type) } else { - internal_err!("Coercion from {acc:?} to {x:?} failed.") + internal_err!("Coercion from {acc} to {x} failed.") } })?; Ok(vec![new_type; arg_types.len()]) diff --git a/datafusion/functions/src/datetime/from_unixtime.rs b/datafusion/functions/src/datetime/from_unixtime.rs index ed238f0074..5d6adfb6f1 100644 --- a/datafusion/functions/src/datetime/from_unixtime.rs +++ b/datafusion/functions/src/datetime/from_unixtime.rs @@ -133,7 +133,7 @@ impl ScalarUDFImpl for FromUnixtimeFunc { if args[0].data_type() != Int64 { return exec_err!( - "Unsupported data type {:?} for function from_unixtime", + "Unsupported data type {} for function from_unixtime", args[0].data_type() ); } @@ -145,7 +145,7 @@ impl ScalarUDFImpl for FromUnixtimeFunc { .cast_to(&Timestamp(Second, Some(Arc::from(tz.to_string()))), None), _ => { exec_err!( - "Unsupported data type {:?} for function from_unixtime", + "Unsupported data type {} for function from_unixtime", args[1].data_type() ) } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 3341a5dbb5..3840c8d8bb 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -39,7 +39,7 @@ Returns the corresponding date. Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. 
The largest supported date value is `9999-12-31`.", syntax_example = "to_date('2017-05-31', '%Y-%m-%d')", sql_example = r#"```sql -> select to_date('2023-01-31'); +> select to_date('2023-01-31'); +-------------------------------+ | to_date(Utf8("2023-01-31")) | +-------------------------------+ @@ -150,7 +150,7 @@ impl ScalarUDFImpl for ToDateFunc { } Utf8View | LargeUtf8 | Utf8 => self.to_date(&args), other => { - exec_err!("Unsupported data type {:?} for function to_date", other) + exec_err!("Unsupported data type {} for function to_date", other) } } } diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index 34914d256c..d2a5f8102b 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -368,10 +368,7 @@ impl ScalarUDFImpl for ToTimestampFunc { } } other => { - exec_err!( - "Unsupported data type {:?} for function to_timestamp", - other - ) + exec_err!("Unsupported data type {other} for function to_timestamp") } } } @@ -424,7 +421,7 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { } other => { exec_err!( - "Unsupported data type {:?} for function to_timestamp_seconds", + "Unsupported data type {} for function to_timestamp_seconds", other ) } @@ -482,7 +479,7 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { ), other => { exec_err!( - "Unsupported data type {:?} for function to_timestamp_millis", + "Unsupported data type {} for function to_timestamp_millis", other ) } @@ -540,7 +537,7 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { ), other => { exec_err!( - "Unsupported data type {:?} for function to_timestamp_micros", + "Unsupported data type {} for function to_timestamp_micros", other ) } @@ -597,7 +594,7 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { } other => { exec_err!( - "Unsupported data type {:?} for function to_timestamp_nanos", + "Unsupported data type {} for function to_timestamp_nanos", other ) } diff --git a/datafusion/functions/src/datetime/to_unixtime.rs b/datafusion/functions/src/datetime/to_unixtime.rs index 13c73815f6..42651cd537 100644 --- a/datafusion/functions/src/datetime/to_unixtime.rs +++ b/datafusion/functions/src/datetime/to_unixtime.rs @@ -118,7 +118,7 @@ impl ScalarUDFImpl for ToUnixtimeFunc { .invoke_with_args(args)? .cast_to(&DataType::Int64, None), other => { - exec_err!("Unsupported data type {:?} for function to_unixtime", other) + exec_err!("Unsupported data type {} for function to_unixtime", other) } } } diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs index 1972535325..228d704e29 100644 --- a/datafusion/functions/src/macros.rs +++ b/datafusion/functions/src/macros.rs @@ -45,7 +45,7 @@ macro_rules! export_functions { ($(($FUNC:ident, $DOC:expr, $($arg:tt)*)),*) => { $( // switch to single-function cases below - export_functions!(single $FUNC, $DOC, $($arg)*); + $crate::export_functions!(single $FUNC, $DOC, $($arg)*); )* }; @@ -122,7 +122,7 @@ macro_rules! make_stub_package { macro_rules! downcast_named_arg { ($ARG:expr, $NAME:expr, $ARRAY_TYPE:ident) => {{ $ARG.as_any().downcast_ref::<$ARRAY_TYPE>().ok_or_else(|| { - internal_datafusion_err!( + datafusion_common::internal_datafusion_err!( "could not cast {} to {}", $NAME, std::any::type_name::<$ARRAY_TYPE>() @@ -139,7 +139,7 @@ macro_rules! downcast_named_arg { #[macro_export] macro_rules! 
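The `macros.rs` hunks here and just below qualify helper calls with `$crate::` (or a full `datafusion_common::` path), so the exported macros expand correctly in downstream crates that have not imported those helpers themselves. A minimal self-contained illustration with made-up names:

```rust
// `double_via_helper!` can be expanded by other crates without importing
// `helper`, because the expansion names it through `$crate::`.
#[macro_export]
macro_rules! double_via_helper {
    ($x:expr) => {
        $crate::helper($x) * 2
    };
}

pub fn helper(x: i32) -> i32 {
    x
}

fn main() {
    assert_eq!(double_via_helper!(21), 42);
}
```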
downcast_arg { ($ARG:expr, $ARRAY_TYPE:ident) => {{ - downcast_named_arg!($ARG, "", $ARRAY_TYPE) + $crate::downcast_named_arg!($ARG, "", $ARRAY_TYPE) }}; } @@ -155,7 +155,7 @@ macro_rules! downcast_arg { /// $GET_DOC: the function to get the documentation of the UDF macro_rules! make_math_unary_udf { ($UDF:ident, $NAME:ident, $UNARY_FUNC:ident, $OUTPUT_ORDERING:expr, $EVALUATE_BOUNDS:expr, $GET_DOC:expr) => { - make_udf_function!($NAME::$UDF, $NAME); + $crate::make_udf_function!($NAME::$UDF, $NAME); mod $NAME { use std::any::Any; @@ -269,7 +269,7 @@ macro_rules! make_math_unary_udf { /// $GET_DOC: the function to get the documentation of the UDF macro_rules! make_math_binary_udf { ($UDF:ident, $NAME:ident, $BINARY_FUNC:ident, $OUTPUT_ORDERING:expr, $GET_DOC:expr) => { - make_udf_function!($NAME::$UDF, $NAME); + $crate::make_udf_function!($NAME::$UDF, $NAME); mod $NAME { use std::any::Any; diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 45c32e660e..8af8e4c2c8 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -26,9 +26,7 @@ use arrow::array::{ }; use arrow::datatypes::DataType; use arrow::error::ArrowError; -use datafusion_common::{ - internal_datafusion_err, not_impl_err, utils::take_function_args, Result, -}; +use datafusion_common::{not_impl_err, utils::take_function_args, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::{ @@ -110,6 +108,14 @@ fn create_abs_function(input_data_type: &DataType) -> Result doc_section(label = "Math Functions"), description = "Returns the absolute value of a number.", syntax_example = "abs(numeric_expression)", + sql_example = r#"```sql +> SELECT abs(-5); ++----------+ +| abs(-5) | ++----------+ +| 5 | ++----------+ +```"#, standard_argument(name = "numeric_expression", prefix = "Numeric") )] #[derive(Debug, PartialEq, Eq, Hash)] diff --git a/datafusion/functions/src/math/cot.rs b/datafusion/functions/src/math/cot.rs index 8006be2eff..43f2012d07 100644 --- a/datafusion/functions/src/math/cot.rs +++ b/datafusion/functions/src/math/cot.rs @@ -32,6 +32,14 @@ use datafusion_macros::user_doc; doc_section(label = "Math Functions"), description = "Returns the cotangent of a number.", syntax_example = r#"cot(numeric_expression)"#, + sql_example = r#"```sql +> SELECT cot(1); ++---------+ +| cot(1) | ++---------+ +| 0.64209 | ++---------+ +```"#, standard_argument(name = "numeric_expression", prefix = "Numeric") )] #[derive(Debug, PartialEq, Eq, Hash)] diff --git a/datafusion/functions/src/math/factorial.rs b/datafusion/functions/src/math/factorial.rs index bd95f5b47b..79f6da94dd 100644 --- a/datafusion/functions/src/math/factorial.rs +++ b/datafusion/functions/src/math/factorial.rs @@ -26,9 +26,7 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use crate::utils::make_scalar_function; -use datafusion_common::{ - arrow_datafusion_err, exec_err, internal_datafusion_err, DataFusionError, Result, -}; +use datafusion_common::{arrow_datafusion_err, exec_err, DataFusionError, Result}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, @@ -39,6 +37,14 @@ use datafusion_macros::user_doc; doc_section(label = "Math Functions"), description = "Factorial. 
Returns 1 if value is less than 2.", syntax_example = "factorial(numeric_expression)", + sql_example = r#"```sql +> SELECT factorial(5); ++---------------+ +| factorial(5) | ++---------------+ +| 120 | ++---------------+ +```"#, standard_argument(name = "numeric_expression", prefix = "Numeric") )] #[derive(Debug, PartialEq, Eq, Hash)] diff --git a/datafusion/functions/src/math/gcd.rs b/datafusion/functions/src/math/gcd.rs index 714718c5e8..0b85e7b54a 100644 --- a/datafusion/functions/src/math/gcd.rs +++ b/datafusion/functions/src/math/gcd.rs @@ -34,6 +34,14 @@ use datafusion_macros::user_doc; doc_section(label = "Math Functions"), description = "Returns the greatest common divisor of `expression_x` and `expression_y`. Returns 0 if both inputs are zero.", syntax_example = "gcd(expression_x, expression_y)", + sql_example = r#"```sql +> SELECT gcd(48, 18); ++------------+ +| gcd(48,18) | ++------------+ +| 6 | ++------------+ +```"#, standard_argument(name = "expression_x", prefix = "First numeric"), standard_argument(name = "expression_y", prefix = "Second numeric") )] diff --git a/datafusion/functions/src/math/iszero.rs b/datafusion/functions/src/math/iszero.rs index ec1200f443..68cd3aca28 100644 --- a/datafusion/functions/src/math/iszero.rs +++ b/datafusion/functions/src/math/iszero.rs @@ -36,6 +36,14 @@ use crate::utils::make_scalar_function; doc_section(label = "Math Functions"), description = "Returns true if a given number is +0.0 or -0.0 otherwise returns false.", syntax_example = "iszero(numeric_expression)", + sql_example = r#"```sql +> SELECT iszero(0); ++------------+ +| iszero(0) | ++------------+ +| true | ++------------+ +```"#, standard_argument(name = "numeric_expression", prefix = "Numeric") )] #[derive(Debug, PartialEq, Eq, Hash)] diff --git a/datafusion/functions/src/math/lcm.rs b/datafusion/functions/src/math/lcm.rs index ca7dbb5856..bfb20dfd5c 100644 --- a/datafusion/functions/src/math/lcm.rs +++ b/datafusion/functions/src/math/lcm.rs @@ -23,9 +23,7 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::error::ArrowError; -use datafusion_common::{ - arrow_datafusion_err, exec_err, internal_datafusion_err, DataFusionError, Result, -}; +use datafusion_common::{arrow_datafusion_err, exec_err, DataFusionError, Result}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, @@ -39,6 +37,14 @@ use crate::utils::make_scalar_function; doc_section(label = "Math Functions"), description = "Returns the least common multiple of `expression_x` and `expression_y`. Returns 0 if either input is zero.", syntax_example = "lcm(expression_x, expression_y)", + sql_example = r#"```sql +> SELECT lcm(4, 5); ++----------+ +| lcm(4,5) | ++----------+ +| 20 | ++----------+ +```"#, standard_argument(name = "expression_x", prefix = "First numeric"), standard_argument(name = "expression_y", prefix = "Second numeric") )] diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index 6604f9ee22..ff1fd0cd4b 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -48,6 +48,14 @@ use datafusion_macros::user_doc; description = "Returns the base-x logarithm of a number. 
Can either provide a specified base, or if omitted then takes the base-10 of a number.", syntax_example = r#"log(base, numeric_expression) log(numeric_expression)"#, + sql_example = r#"```sql +> SELECT log(10); ++---------+ +| log(10) | ++---------+ +| 1.0 | ++---------+ +```"#, standard_argument(name = "base", prefix = "Base numeric"), standard_argument(name = "numeric_expression", prefix = "Numeric") )] diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs index baa3147f62..5b8252137b 100644 --- a/datafusion/functions/src/math/monotonicity.rs +++ b/datafusion/functions/src/math/monotonicity.rs @@ -18,8 +18,8 @@ use std::sync::LazyLock; use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_doc::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::Documentation; @@ -45,6 +45,16 @@ static DOCUMENTATION_ACOS: LazyLock = LazyLock::new(|| { "acos(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT acos(1); ++----------+ +| acos(1) | ++----------+ +| 0.0 | ++----------+ +```"#, + ) .build() }); @@ -69,15 +79,24 @@ pub fn acosh_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ACOSH: LazyLock = LazyLock::new(|| { - Documentation::builder( +static DOCUMENTATION_ACOSH: LazyLock = + LazyLock::new(|| { + Documentation::builder( DOC_SECTION_MATH, "Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number.", "acosh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example(r#"```sql +> SELECT acosh(2); ++------------+ +| acosh(2) | ++------------+ +| 1.31696 | ++------------+ +```"#) .build() -}); + }); pub fn get_acosh_doc() -> &'static Documentation { &DOCUMENTATION_ACOSH @@ -105,6 +124,16 @@ static DOCUMENTATION_ASIN: LazyLock = LazyLock::new(|| { "asin(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT asin(0.5); ++------------+ +| asin(0.5) | ++------------+ +| 0.5235988 | ++------------+ +```"#, + ) .build() }); @@ -124,6 +153,16 @@ static DOCUMENTATION_ASINH: LazyLock = LazyLock::new(|| { "asinh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#" ```sql +> SELECT asinh(1); ++------------+ +| asinh(1) | ++------------+ +| 0.8813736 | ++------------+ +```"#, + ) .build() }); @@ -143,6 +182,16 @@ static DOCUMENTATION_ATAN: LazyLock = LazyLock::new(|| { "atan(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql + > SELECT atan(1); ++-----------+ +| atan(1) | ++-----------+ +| 0.7853982 | ++-----------+ +```"#, + ) .build() }); @@ -165,15 +214,24 @@ pub fn atanh_order(input: &[ExprProperties]) -> Result { } } -static DOCUMENTATION_ATANH: LazyLock = LazyLock::new(|| { - Documentation::builder( +static DOCUMENTATION_ATANH: LazyLock = + LazyLock::new(|| { + Documentation::builder( DOC_SECTION_MATH, "Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number.", "atanh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example(r#"```sql + > SELECT atanh(0.5); ++-------------+ +| atanh(0.5) | ++-------------+ +| 
0.5493061 | ++-------------+ +```"#) .build() -}); + }); pub fn get_atanh_doc() -> &'static Documentation { &DOCUMENTATION_ATANH @@ -185,8 +243,9 @@ pub fn atan2_order(_input: &[ExprProperties]) -> Result { Ok(SortProperties::Unordered) } -static DOCUMENTATION_ATANH2: LazyLock = LazyLock::new(|| { - Documentation::builder( +static DOCUMENTATION_ATANH2: LazyLock = + LazyLock::new(|| { + Documentation::builder( DOC_SECTION_MATH, "Returns the arc tangent or inverse tangent of `expression_y / expression_x`.", "atan2(expression_y, expression_x)", @@ -201,8 +260,16 @@ Can be a constant, column, or function, and any combination of arithmetic operat r#"Second numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators."#, ) + .with_sql_example(r#"```sql +> SELECT atan2(1, 1); ++------------+ +| atan2(1,1) | ++------------+ +| 0.7853982 | ++------------+ +```"#) .build() -}); + }); pub fn get_atan2_doc() -> &'static Documentation { &DOCUMENTATION_ATANH2 @@ -220,6 +287,16 @@ static DOCUMENTATION_CBRT: LazyLock = LazyLock::new(|| { "cbrt(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT cbrt(27); ++-----------+ +| cbrt(27) | ++-----------+ +| 3.0 | ++-----------+ +```"#, + ) .build() }); @@ -239,6 +316,16 @@ static DOCUMENTATION_CEIL: LazyLock = LazyLock::new(|| { "ceil(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql + > SELECT ceil(3.14); ++------------+ +| ceil(3.14) | ++------------+ +| 4.0 | ++------------+ +```"#, + ) .build() }); @@ -260,6 +347,16 @@ static DOCUMENTATION_COS: LazyLock = LazyLock::new(|| { "cos(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT cos(0); ++--------+ +| cos(0) | ++--------+ +| 1.0 | ++--------+ +```"#, + ) .build() }); @@ -290,6 +387,16 @@ static DOCUMENTATION_COSH: LazyLock = LazyLock::new(|| { "cosh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT cosh(1); ++-----------+ +| cosh(1) | ++-----------+ +| 1.5430806 | ++-----------+ +```"#, + ) .build() }); @@ -309,6 +416,16 @@ static DOCUMENTATION_DEGREES: LazyLock = LazyLock::new(|| { "degrees(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql + > SELECT degrees(pi()); ++------------+ +| degrees(0) | ++------------+ +| 180.0 | ++------------+ +```"#, + ) .build() }); @@ -328,6 +445,16 @@ static DOCUMENTATION_EXP: LazyLock = LazyLock::new(|| { "exp(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT exp(1); ++---------+ +| exp(1) | ++---------+ +| 2.71828 | ++---------+ +```"#, + ) .build() }); @@ -347,6 +474,16 @@ static DOCUMENTATION_FLOOR: LazyLock = LazyLock::new(|| { "floor(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT floor(3.14); ++-------------+ +| floor(3.14) | ++-------------+ +| 3.0 | ++-------------+ +```"#, + ) .build() }); @@ -375,6 +512,16 @@ static DOCUMENTATION_LN: LazyLock = LazyLock::new(|| { "ln(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT ln(2.71828); ++-------------+ +| ln(2.71828) | ++-------------+ +| 1.0 | 
++-------------+ +```"#, + ) .build() }); @@ -403,6 +550,16 @@ static DOCUMENTATION_LOG2: LazyLock = LazyLock::new(|| { "log2(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT log2(8); ++-----------+ +| log2(8) | ++-----------+ +| 3.0 | ++-----------+ +```"#, + ) .build() }); @@ -431,6 +588,16 @@ static DOCUMENTATION_LOG10: LazyLock = LazyLock::new(|| { "log10(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT log10(100); ++-------------+ +| log10(100) | ++-------------+ +| 2.0 | ++-------------+ +```"#, + ) .build() }); @@ -443,18 +610,28 @@ pub fn radians_order(input: &[ExprProperties]) -> Result { Ok(input[0].sort_properties) } -static DOCUMENTATION_RADIONS: LazyLock = LazyLock::new(|| { +static DOCUMENTATION_RADIANS: LazyLock = LazyLock::new(|| { Documentation::builder( DOC_SECTION_MATH, "Converts degrees to radians.", "radians(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT radians(180); ++----------------+ +| radians(180) | ++----------------+ +| 3.14159265359 | ++----------------+ +```"#, + ) .build() }); pub fn get_radians_doc() -> &'static Documentation { - &DOCUMENTATION_RADIONS + &DOCUMENTATION_RADIANS } /// Non-decreasing on \[0, π\] and then non-increasing on \[π, 2π\]. @@ -471,6 +648,16 @@ static DOCUMENTATION_SIN: LazyLock = LazyLock::new(|| { "sin(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT sin(0); ++----------+ +| sin(0) | ++----------+ +| 0.0 | ++----------+ +```"#, + ) .build() }); @@ -490,6 +677,16 @@ static DOCUMENTATION_SINH: LazyLock = LazyLock::new(|| { "sinh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT sinh(1); ++-----------+ +| sinh(1) | ++-----------+ +| 1.1752012 | ++-----------+ +```"#, + ) .build() }); @@ -539,6 +736,16 @@ static DOCUMENTATION_TAN: LazyLock = LazyLock::new(|| { "tan(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql +> SELECT tan(pi()/4); ++--------------+ +| tan(PI()/4) | ++--------------+ +| 1.0 | ++--------------+ +```"#, + ) .build() }); @@ -558,6 +765,16 @@ static DOCUMENTATION_TANH: LazyLock = LazyLock::new(|| { "tanh(numeric_expression)", ) .with_standard_argument("numeric_expression", Some("Numeric")) + .with_sql_example( + r#"```sql + > SELECT tanh(20); + +----------+ + | tanh(20) | + +----------+ + | 1.0 | + +----------+ + ```"#, + ) .build() }); diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs index ef9b2eff20..759b0f5fd5 100644 --- a/datafusion/functions/src/math/nans.rs +++ b/datafusion/functions/src/math/nans.rs @@ -31,6 +31,14 @@ use std::sync::Arc; doc_section(label = "Math Functions"), description = "Returns true if a given number is +NaN or -NaN otherwise returns false.", syntax_example = "isnan(numeric_expression)", + sql_example = r#"```sql +> SELECT isnan(1); ++----------+ +| isnan(1) | ++----------+ +| false | ++----------+ +```"#, standard_argument(name = "numeric_expression", prefix = "Numeric") )] #[derive(Debug, PartialEq, Eq, Hash)] diff --git a/datafusion/functions/src/math/nanvl.rs b/datafusion/functions/src/math/nanvl.rs index 3d05a03e5a..f0835b4d48 100644 --- a/datafusion/functions/src/math/nanvl.rs 
+++ b/datafusion/functions/src/math/nanvl.rs @@ -36,6 +36,14 @@ use datafusion_macros::user_doc; description = r#"Returns the first argument if it's not _NaN_. Returns the second argument otherwise."#, syntax_example = "nanvl(expression_x, expression_y)", + sql_example = r#"```sql +> SELECT nanvl(0, 5); ++------------+ +| nanvl(0,5) | ++------------+ +| 0 | ++------------+ +```"#, argument( name = "expression_x", description = "Numeric expression to return if it's not _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators." diff --git a/datafusion/functions/src/math/power.rs b/datafusion/functions/src/math/power.rs index 73325de4cf..ad2e795d08 100644 --- a/datafusion/functions/src/math/power.rs +++ b/datafusion/functions/src/math/power.rs @@ -24,8 +24,8 @@ use super::log::LogFunc; use arrow::array::{ArrayRef, AsArray, Int64Array}; use arrow::datatypes::{ArrowNativeTypeOp, DataType, Float64Type}; use datafusion_common::{ - arrow_datafusion_err, exec_datafusion_err, exec_err, internal_datafusion_err, - plan_datafusion_err, DataFusionError, Result, ScalarValue, + arrow_datafusion_err, exec_datafusion_err, exec_err, plan_datafusion_err, + DataFusionError, Result, ScalarValue, }; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; @@ -39,6 +39,14 @@ use datafusion_macros::user_doc; doc_section(label = "Math Functions"), description = "Returns a base expression raised to the power of an exponent.", syntax_example = "power(base, exponent)", + sql_example = r#"```sql +> SELECT power(2, 3); ++-------------+ +| power(2,3) | ++-------------+ +| 8 | ++-------------+ +```"#, standard_argument(name = "base", prefix = "Numeric"), standard_argument(name = "exponent", prefix = "Exponent numeric") )] diff --git a/datafusion/functions/src/math/random.rs b/datafusion/functions/src/math/random.rs index f9c4b198d1..d63e76a06d 100644 --- a/datafusion/functions/src/math/random.rs +++ b/datafusion/functions/src/math/random.rs @@ -32,7 +32,15 @@ use datafusion_macros::user_doc; doc_section(label = "Math Functions"), description = r#"Returns a random float value in the range [0, 1). The random seed is unique to each row."#, - syntax_example = "random()" + syntax_example = "random()", + sql_example = r#"```sql +> SELECT random(); ++------------------+ +| random() | ++------------------+ +| 0.7389238902938 | ++------------------+ +```"# )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct RandomFunc { diff --git a/datafusion/functions/src/math/round.rs b/datafusion/functions/src/math/round.rs index e13d6b8f9a..de5c0930e0 100644 --- a/datafusion/functions/src/math/round.rs +++ b/datafusion/functions/src/math/round.rs @@ -41,7 +41,15 @@ use datafusion_macros::user_doc; argument( name = "decimal_places", description = "Optional. The number of decimal places to round to. Defaults to 0." - ) + ), + sql_example = r#"```sql +> SELECT round(3.14159); ++--------------+ +| round(3.14159)| ++--------------+ +| 3.0 | ++--------------+ +```"# )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct RoundFunc { diff --git a/datafusion/functions/src/math/signum.rs b/datafusion/functions/src/math/signum.rs index 73931f303b..bbe6178f39 100644 --- a/datafusion/functions/src/math/signum.rs +++ b/datafusion/functions/src/math/signum.rs @@ -38,7 +38,15 @@ use crate::utils::make_scalar_function; Negative numbers return `-1`. 
Zero and positive numbers return `1`."#, syntax_example = "signum(numeric_expression)", - standard_argument(name = "numeric_expression", prefix = "Numeric") + standard_argument(name = "numeric_expression", prefix = "Numeric"), + sql_example = r#"```sql +> SELECT signum(-42); ++-------------+ +| signum(-42) | ++-------------+ +| -1 | ++-------------+ +```"# )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct SignumFunc { diff --git a/datafusion/functions/src/math/trunc.rs b/datafusion/functions/src/math/trunc.rs index c8e2de502b..9d1b4336f6 100644 --- a/datafusion/functions/src/math/trunc.rs +++ b/datafusion/functions/src/math/trunc.rs @@ -45,7 +45,16 @@ use datafusion_macros::user_doc; `decimal_places` is a positive integer, truncates digits to the right of the decimal point. If `decimal_places` is a negative integer, replaces digits to the left of the decimal point with `0`."# - ) + ), + sql_example = r#" + ```sql + > SELECT trunc(42.738); + +----------------+ + | trunc(42.738) | + +----------------+ + | 42 | + +----------------+ + ```"# )] #[derive(Debug, PartialEq, Eq, Hash)] pub struct TruncFunc { diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 39858119c8..ca3d19822e 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -24,7 +24,9 @@ use arrow::array::{new_null_array, ArrayIter, AsArray}; use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; use arrow::array::{ArrayAccessor, StringViewArray}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_string_view_array; +use datafusion_common::cast::{ + as_large_string_array, as_string_array, as_string_view_array, +}; use datafusion_common::exec_err; use datafusion_common::plan_err; use datafusion_common::ScalarValue; @@ -95,13 +97,12 @@ impl Default for RegexpReplaceFunc { impl RegexpReplaceFunc { pub fn new() -> Self { use DataType::*; + use TypeSignature::*; Self { signature: Signature::one_of( vec![ - TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), - TypeSignature::Exact(vec![Utf8View, Utf8, Utf8]), - TypeSignature::Exact(vec![Utf8, Utf8, Utf8, Utf8]), - TypeSignature::Exact(vec![Utf8View, Utf8, Utf8, Utf8]), + Uniform(3, vec![Utf8View, LargeUtf8, Utf8]), + Uniform(4, vec![Utf8View, LargeUtf8, Utf8]), ], Volatility::Immutable, ), @@ -238,15 +239,14 @@ fn regex_replace_posix_groups(replacement: &str) -> String { /// # Ok(()) /// # } /// ``` -pub fn regexp_replace<'a, T: OffsetSizeTrait, V, B>( - string_array: V, - pattern_array: B, - replacement_array: B, - flags: Option<&ArrayRef>, +pub fn regexp_replace<'a, T: OffsetSizeTrait, U>( + string_array: U, + pattern_array: U, + replacement_array: U, + flags_array: Option, ) -> Result where - V: ArrayAccessor, - B: ArrayAccessor, + U: ArrayAccessor, { // Default implementation for regexp_replace, assumes all args are arrays // and args is a sequence of 3 or 4 elements. 
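The rewritten `regexp_replace` above takes a single generic `ArrayAccessor` type for the string, pattern, replacement, and (optional) flags arguments, so one function body can serve `Utf8`, `LargeUtf8`, and `Utf8View` inputs. Below is a minimal standalone sketch of that pattern, not the DataFusion implementation itself; it assumes only the `arrow` crate, and `count_non_null` is an illustrative helper name that does not exist in this patch:

```rust
use arrow::array::{ArrayAccessor, ArrayIter, LargeStringArray, StringArray, StringViewArray};

// One generic body bound by `ArrayAccessor<Item = &str>` works for all three
// Arrow string encodings, mirroring the reworked `regexp_replace` signature.
fn count_non_null<'a, U>(values: U) -> usize
where
    U: ArrayAccessor<Item = &'a str>,
{
    // `ArrayIter` yields `Option<&str>`; `flatten` keeps only non-null values
    ArrayIter::new(values).flatten().count()
}

fn main() {
    let utf8 = StringArray::from(vec![Some("a"), None, Some("c")]);
    let large = LargeStringArray::from(vec![Some("a"), Some("b")]);
    let view = StringViewArray::from_iter_values(["x", "y", "z"]);

    // The same generic function accepts references to all three array kinds
    assert_eq!(count_non_null(&utf8), 2);
    assert_eq!(count_non_null(&large), 2);
    assert_eq!(count_non_null(&view), 3);
}
```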
@@ -260,7 +260,7 @@ where let pattern_array_iter = ArrayIter::new(pattern_array); let replacement_array_iter = ArrayIter::new(replacement_array); - match flags { + match flags_array { None => { let result_iter = string_array_iter .zip(pattern_array_iter) @@ -307,13 +307,13 @@ where } } } - Some(flags) => { - let flags_array = as_generic_string_array::(flags)?; + Some(flags_array) => { + let flags_array_iter = ArrayIter::new(flags_array); let result_iter = string_array_iter .zip(pattern_array_iter) .zip(replacement_array_iter) - .zip(flags_array.iter()) + .zip(flags_array_iter) .map(|(((string, pattern), replacement), flags)| { match (string, pattern, replacement, flags) { (Some(string), Some(pattern), Some(replacement), Some(flags)) => { @@ -398,12 +398,37 @@ fn _regexp_replace_early_abort( /// Note: If the array is empty or the first argument is null, /// then calls the given early abort function. macro_rules! fetch_string_arg { - ($ARG:expr, $NAME:expr, $T:ident, $EARLY_ABORT:ident, $ARRAY_SIZE:expr) => {{ - let array = as_generic_string_array::<$T>($ARG)?; - if array.len() == 0 || array.is_null(0) { - return $EARLY_ABORT(array, $ARRAY_SIZE); - } else { - array.value(0) + ($ARG:expr, $NAME:expr, $EARLY_ABORT:ident, $ARRAY_SIZE:expr) => {{ + let string_array_type = ($ARG).data_type(); + match string_array_type { + DataType::Utf8 => { + let array = as_string_array($ARG)?; + if array.len() == 0 || array.is_null(0) { + return $EARLY_ABORT(array, $ARRAY_SIZE); + } else { + array.value(0) + } + } + DataType::LargeUtf8 => { + let array = as_large_string_array($ARG)?; + if array.len() == 0 || array.is_null(0) { + return $EARLY_ABORT(array, $ARRAY_SIZE); + } else { + array.value(0) + } + } + DataType::Utf8View => { + let array = as_string_view_array($ARG)?; + if array.len() == 0 || array.is_null(0) { + return $EARLY_ABORT(array, $ARRAY_SIZE); + } else { + array.value(0) + } + } + _ => unreachable!( + "Invalid data type for regexp_replace: {}", + string_array_type + ), } }}; } @@ -417,23 +442,17 @@ fn _regexp_replace_static_pattern_replace( args: &[ArrayRef], ) -> Result { let array_size = args[0].len(); - let pattern = fetch_string_arg!( - &args[1], - "pattern", - i32, - _regexp_replace_early_abort, - array_size - ); + let pattern = + fetch_string_arg!(&args[1], "pattern", _regexp_replace_early_abort, array_size); let replacement = fetch_string_arg!( &args[2], "replacement", - i32, _regexp_replace_early_abort, array_size ); let flags = match args.len() { 3 => None, - 4 => Some(fetch_string_arg!(&args[3], "flags", i32, _regexp_replace_early_abort, array_size)), + 4 => Some(fetch_string_arg!(&args[3], "flags", _regexp_replace_early_abort, array_size)), other => { return exec_err!( "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4." 
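The reworked `fetch_string_arg!` macro above dispatches on the argument's `DataType` so that scalar pattern, replacement, and flags values can arrive as `Utf8`, `LargeUtf8`, or `Utf8View`. A minimal non-macro sketch of that dispatch, using the same `datafusion_common::cast` helpers this hunk imports, might look roughly like the following; `first_string_value` is an illustrative name, not part of the codebase:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray};
use arrow::datatypes::DataType;
use datafusion_common::cast::{as_large_string_array, as_string_array, as_string_view_array};
use datafusion_common::{exec_err, Result};

/// Return the first value of a string-like argument, or `None` if the array
/// is empty or its first element is null (the "early abort" case).
fn first_string_value(arg: &ArrayRef) -> Result<Option<&str>> {
    if arg.is_empty() || arg.is_null(0) {
        return Ok(None);
    }
    match arg.data_type() {
        DataType::Utf8 => Ok(Some(as_string_array(arg)?.value(0))),
        DataType::LargeUtf8 => Ok(Some(as_large_string_array(arg)?.value(0))),
        DataType::Utf8View => Ok(Some(as_string_view_array(arg)?.value(0))),
        other => exec_err!("unsupported string type {other} in this sketch"),
    }
}

fn main() -> Result<()> {
    let flags: ArrayRef = Arc::new(StringArray::from(vec!["i"]));
    assert_eq!(first_string_value(&flags)?, Some("i"));
    Ok(())
}
```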
@@ -590,38 +609,61 @@ pub fn specialize_regexp_replace( .map(|arg| arg.to_array(inferred_length)) .collect::>>()?; - match args[0].data_type() { - DataType::Utf8View => { - let string_array = args[0].as_string_view(); + match ( + args[0].data_type(), + args[1].data_type(), + args[2].data_type(), + args.get(3).map(|a| a.data_type()), + ) { + ( + DataType::Utf8, + DataType::Utf8, + DataType::Utf8, + Some(DataType::Utf8) | None, + ) => { + let string_array = args[0].as_string::(); let pattern_array = args[1].as_string::(); let replacement_array = args[2].as_string::(); - regexp_replace::( + let flags_array = args.get(3).map(|a| a.as_string::()); + regexp_replace::( string_array, pattern_array, replacement_array, - args.get(3), + flags_array, ) } - DataType::Utf8 => { - let string_array = args[0].as_string::(); - let pattern_array = args[1].as_string::(); - let replacement_array = args[2].as_string::(); - regexp_replace::( + ( + DataType::Utf8View, + DataType::Utf8View, + DataType::Utf8View, + Some(DataType::Utf8View) | None, + ) => { + let string_array = args[0].as_string_view(); + let pattern_array = args[1].as_string_view(); + let replacement_array = args[2].as_string_view(); + let flags_array = args.get(3).map(|a| a.as_string_view()); + regexp_replace::( string_array, pattern_array, replacement_array, - args.get(3), + flags_array, ) } - DataType::LargeUtf8 => { + ( + DataType::LargeUtf8, + DataType::LargeUtf8, + DataType::LargeUtf8, + Some(DataType::LargeUtf8) | None, + ) => { let string_array = args[0].as_string::(); let pattern_array = args[1].as_string::(); let replacement_array = args[2].as_string::(); - regexp_replace::( + let flags_array = args.get(3).map(|a| a.as_string::()); + regexp_replace::( string_array, pattern_array, replacement_array, - args.get(3), + flags_array, ) } other => { @@ -650,8 +692,8 @@ mod tests { vec!["afooc", "acd", "afoocd1234567890123", "123456789012afooc"]; let values = <$T>::from(values); - let patterns = StringArray::from(patterns); - let replacements = StringArray::from(replacement); + let patterns = <$T>::from(patterns); + let replacements = <$T>::from(replacement); let expected = <$T>::from(expected); let re = _regexp_replace_static_pattern_replace::<$O>(&[ diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index e322c757ef..a93e70e714 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -140,7 +140,7 @@ impl ScalarUDFImpl for ConcatFunc { Some(Some(v)) => result.push_str(v), Some(None) => {} // null literal None => plan_err!( - "Concat function does not support scalar type {:?}", + "Concat function does not support scalar type {}", scalar )?, } diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 6372131659..7e50676933 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -140,7 +140,7 @@ fn contains(args: &[ArrayRef]) -> Result { } } else { exec_err!( - "Unsupported data type {:?}, {:?} for function `contains`.", + "Unsupported data type {}, {:?} for function `contains`.", args[0].data_type(), args[1].data_type() ) diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 8a2020c5a0..f127b452b2 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -145,7 +145,7 @@ impl ScalarUDFImpl for ReplaceFunc { } } else { exec_err!( - "Unsupported 
data type {:?}, {:?}, {:?} for function replace.", + "Unsupported data type {}, {:?}, {:?} for function replace.", data_types[0], data_types[1], data_types[2] diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index d42cd7153f..fe979720bc 100644 --- a/datafusion/macros/Cargo.toml +++ b/datafusion/macros/Cargo.toml @@ -19,6 +19,7 @@ name = "datafusion-macros" description = "Procedural macros for DataFusion query engine" keywords = ["datafusion", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } @@ -40,6 +41,6 @@ path = "src/user_doc.rs" proc-macro = true [dependencies] -datafusion-expr = { workspace = true } -quote = "1.0.40" +datafusion-doc = { workspace = true } +quote = "1.0.41" syn = { version = "2.0.106", features = ["full"] } diff --git a/datafusion/macros/README.md b/datafusion/macros/README.md index c78c02f1ca..c45bba1423 100644 --- a/datafusion/macros/README.md +++ b/datafusion/macros/README.md @@ -17,15 +17,14 @@ under the License. --> -# DataFusion Window Function Common Library +# Apache DataFusion Macros -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains common macros used in DataFusion -Most projects should use the [`datafusion`] crate directly, which re-exports -this module. If you are already using the [`datafusion`] crate, there is no -reason to use this crate directly in your project as well. +Most projects should use the [`datafusion`] crate directly. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/macros/src/user_doc.rs b/datafusion/macros/src/user_doc.rs index 31cf9bb1b7..90e4e47485 100644 --- a/datafusion/macros/src/user_doc.rs +++ b/datafusion/macros/src/user_doc.rs @@ -22,7 +22,7 @@ #![cfg_attr(docsrs, feature(doc_auto_cfg))] extern crate proc_macro; -use datafusion_expr::scalar_doc_sections::doc_sections_const; +use datafusion_doc::scalar_doc_sections::doc_sections_const; use proc_macro::TokenStream; use quote::quote; use syn::{parse_macro_input, DeriveInput, LitStr}; diff --git a/datafusion/optimizer/README.md b/datafusion/optimizer/README.md index 1c9b37e09f..a95ec4828b 100644 --- a/datafusion/optimizer/README.md +++ b/datafusion/optimizer/README.md @@ -17,7 +17,9 @@ under the License. --> -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +# Apache DataFusion Optimizer + +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains the DataFusion logical optimizer. Please see [Query Optimizer] in the Library User Guide for more information. @@ -26,6 +28,7 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion [query optimizer]: https://datafusion.apache.org/library-user-guide/query-optimizer.html diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index e268053781..fffa1cbe6f 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion_expr::binary::BinaryTypeCoercer; -use itertools::izip; +use itertools::{izip, Itertools as _}; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema}; @@ -252,7 +252,7 @@ impl<'a> TypeCoercionRewriter<'a> { if dt.is_integer() || dt.is_null() { expr.cast_to(&DataType::Int64, schema) } else { - plan_err!("Expected {expr_name} to be an integer or null, but got {dt:?}") + plan_err!("Expected {expr_name} to be an integer or null, but got {dt}") } } @@ -351,9 +351,10 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { .data; let expr_type = expr.get_type(self.schema)?; let subquery_type = new_plan.schema().field(0).data_type(); - let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(plan_datafusion_err!( - "expr type {expr_type:?} can't cast to {subquery_type:?} in InSubquery" - ), + let common_type = comparison_coercion(&expr_type, subquery_type).ok_or( + plan_datafusion_err!( + "expr type {expr_type} can't cast to {subquery_type} in InSubquery" + ), )?; let new_subquery = Subquery { subquery: Arc::new(new_plan), @@ -478,7 +479,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { get_coerce_type_for_list(&expr_data_type, &list_data_types); match result_type { None => plan_err!( - "Can not find compatible types to compare {expr_data_type:?} with {list_data_types:?}" + "Can not find compatible types to compare {expr_data_type} with [{}]", list_data_types.iter().join(", ") ), Some(coerced_type) => { // find the coerced type @@ -685,7 +686,7 @@ fn coerce_scalar_range_aware( // If type coercion fails, check if the largest type in family works: if let Some(largest_type) = get_widest_type_in_family(target_type) { coerce_scalar(largest_type, value).map_or_else( - |_| exec_err!("Cannot cast {value:?} to {target_type:?}"), + |_| exec_err!("Cannot cast {value:?} to {target_type}"), |_| ScalarValue::try_from(target_type), ) } else { @@ -737,7 +738,7 @@ fn extract_window_frame_target_type(col_type: &DataType) -> Result { } else if let DataType::Dictionary(_, value_type) = col_type { extract_window_frame_target_type(value_type) } else { - internal_err!("Cannot run range queries on datatype: {col_type:?}") + internal_err!("Cannot run range queries on datatype: {col_type}") } } @@ -896,8 +897,9 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { get_coerce_type_for_case_expression(&when_types, Some(case_type)); coerced_type.ok_or_else(|| { plan_datafusion_err!( - "Failed to coerce case ({case_type:?}) and when ({when_types:?}) \ - to common types in CASE WHEN expression" + "Failed to coerce case ({case_type}) and when ({}) \ + to common types in CASE WHEN expression", + when_types.iter().join(", ") ) }) }) @@ -905,10 +907,19 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { let then_else_coerce_type = get_coerce_type_for_case_expression(&then_types, else_type.as_ref()).ok_or_else( || { - plan_datafusion_err!( - "Failed to coerce then 
({then_types:?}) and else ({else_type:?}) \ - to common types in CASE WHEN expression" - ) + if let Some(else_type) = else_type { + plan_datafusion_err!( + "Failed to coerce then ({}) and else ({else_type}) \ + to common types in CASE WHEN expression", + then_types.iter().join(", ") + ) + } else { + plan_datafusion_err!( + "Failed to coerce then ({}) and else (None) \ + to common types in CASE WHEN expression", + then_types.iter().join(", ") + ) + } }, )?; @@ -1681,7 +1692,7 @@ mod test { let err = Projection::try_new(vec![udaf], empty).err().unwrap(); assert!( - err.strip_backtrace().starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'MY_AVG' function: coercion from [Utf8] to the signature Uniform(1, [Float64]) failed") + err.strip_backtrace().starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'MY_AVG' function: coercion from Utf8 to the signature Uniform(1, [Float64]) failed") ); Ok(()) } @@ -1742,7 +1753,7 @@ mod test { .err() .unwrap() .strip_backtrace(); - assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from [Utf8] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed")); + assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from Utf8 to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed")); Ok(()) } @@ -2231,7 +2242,7 @@ mod test { let err = coerce_case_expression(case, &schema).unwrap_err(); assert_snapshot!( err.strip_backtrace(), - @"Error during planning: Failed to coerce case (Interval(MonthDayNano)) and when ([Float32, Binary, Utf8]) to common types in CASE WHEN expression" + @"Error during planning: Failed to coerce case (Interval(MonthDayNano)) and when (Float32, Binary, Utf8) to common types in CASE WHEN expression" ); let case = Case { @@ -2246,7 +2257,7 @@ mod test { let err = coerce_case_expression(case, &schema).unwrap_err(); assert_snapshot!( err.strip_backtrace(), - @"Error during planning: Failed to coerce then ([Date32, Float32, Binary]) and else (Some(Timestamp(Nanosecond, None))) to common types in CASE WHEN expression" + @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(Nanosecond, None)) to common types in CASE WHEN expression" ); Ok(()) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index d78819c7c3..ec1f8f991a 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -316,6 +316,19 @@ impl CommonSubexprEliminate { } => { let rewritten_aggr_expr = new_exprs_list.pop().unwrap(); let new_aggr_expr = original_exprs_list.pop().unwrap(); + let saved_names = if let Some(aggr_expr) = aggr_expr { + let name_preserver = NamePreserver::new_for_projection(); + aggr_expr + .iter() + .map(|expr| Some(name_preserver.save(expr))) + .collect::>() + } else { + new_aggr_expr + .clone() + .into_iter() + .map(|_| None) + .collect::>() + }; let mut agg_exprs = common_exprs .into_iter() @@ -326,10 +339,19 @@ impl CommonSubexprEliminate { for expr in &new_group_expr { extract_expressions(expr, &mut proj_exprs) } - for (expr_rewritten, expr_orig) in - rewritten_aggr_expr.into_iter().zip(new_aggr_expr) + for ((expr_rewritten, expr_orig), saved_name) in + rewritten_aggr_expr + 
.into_iter() + .zip(new_aggr_expr) + .zip(saved_names) { if expr_rewritten == expr_orig { + let expr_rewritten = if let Some(saved_name) = saved_name + { + saved_name.restore(expr_rewritten) + } else { + expr_rewritten + }; if let Expr::Alias(Alias { expr, name, .. }) = expr_rewritten { diff --git a/datafusion/optimizer/src/extract_equijoin_predicate.rs b/datafusion/optimizer/src/extract_equijoin_predicate.rs index 55cf33ef43..c76de942de 100644 --- a/datafusion/optimizer/src/extract_equijoin_predicate.rs +++ b/datafusion/optimizer/src/extract_equijoin_predicate.rs @@ -19,8 +19,8 @@ use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; -use datafusion_common::DFSchema; -use datafusion_common::Result; +use datafusion_common::{internal_err, DFSchema}; +use datafusion_common::{NullEquality, Result}; use datafusion_expr::utils::split_conjunction_owned; use datafusion_expr::utils::{can_hash, find_valid_equijoin_key_pair}; use datafusion_expr::{BinaryExpr, Expr, ExprSchemable, Join, LogicalPlan, Operator}; @@ -82,6 +82,45 @@ impl OptimizerRule for ExtractEquijoinPredicate { let (equijoin_predicates, non_equijoin_expr) = split_eq_and_noneq_join_predicate(expr, left_schema, right_schema)?; + // Equi-join operators like HashJoin support a special behavior + // that evaluates `NULL = NULL` as true instead of NULL. Therefore, + // we transform `t1.c1 IS NOT DISTINCT FROM t2.c1` into an equi-join + // and set the `NullEquality` configuration in the join operator. + // This allows certain queries to use Hash Join instead of + // Nested Loop Join, resulting in better performance. + // + // Only convert when there are NO equijoin predicates, to be conservative. + if on.is_empty() + && equijoin_predicates.is_empty() + && non_equijoin_expr.is_some() + { + // SAFETY: checked in the outer `if` + let expr = non_equijoin_expr.clone().unwrap(); + let (equijoin_predicates, non_equijoin_expr) = + split_is_not_distinct_from_and_other_join_predicate( + expr, + left_schema, + right_schema, + )?; + + if !equijoin_predicates.is_empty() { + on.extend(equijoin_predicates); + + return Ok(Transformed::yes(LogicalPlan::Join(Join { + left, + right, + on, + filter: non_equijoin_expr, + join_type, + join_constraint, + schema, + // According to `is not distinct from`'s semantics, it's + // safe to override it + null_equality: NullEquality::NullEqualsNull, + }))); + } + } + if !equijoin_predicates.is_empty() { on.extend(equijoin_predicates); Ok(Transformed::yes(LogicalPlan::Join(Join { @@ -112,22 +151,98 @@ impl OptimizerRule for ExtractEquijoinPredicate { } } +/// Splits an ANDed filter expression into equijoin predicates and remaining filters. +/// Returns all equijoin predicates and the remaining filters combined with AND. +/// +/// # Example +/// +/// For the expression `a.id = b.id AND a.x > 10 AND b.x > b.id`, this function will extract `a.id = b.id` as an equijoin predicate. +/// +/// It first splits the ANDed sub-expressions: +/// - expr1: a.id = b.id +/// - expr2: a.x > 10 +/// - expr3: b.x > b.id +/// +/// Then, it filters out the equijoin predicates and collects the non-equality expressions. +/// The equijoin condition is: +/// - It is an equality expression like `lhs == rhs` +/// - All column references in `lhs` are from the left schema, and all in `rhs` are from the right schema +/// +/// According to the above rule, `expr1` is the equijoin predicate, while `expr2` and `expr3` are not. 
+/// The function returns Ok(\[expr1\], Some(expr2 AND expr3)) fn split_eq_and_noneq_join_predicate( filter: Expr, left_schema: &DFSchema, right_schema: &DFSchema, ) -> Result<(Vec, Option)> { + split_op_and_other_join_predicates(filter, left_schema, right_schema, Operator::Eq) +} + +/// See `split_eq_and_noneq_join_predicate`'s comment for the idea. This function +/// splits out `is not distinct from` expressions instead of equality exprs. +/// The `is not distinct from` exprs will be returned as `EquijoinPredicate`. +/// +/// # Example +/// - Input: `a.id IS NOT DISTINCT FROM b.id AND a.x > 10 AND b.x > b.id` +/// - Output from this splitter: `Ok([a.id, b.id], Some((a.x > 10) AND (b.x > b.id)))` +/// +/// # Note +/// The caller should be cautious -- `is not distinct from` is not equivalent to an +/// equal expression; the caller is responsible for correctly setting the +/// `nulls equals nulls` property in the join operator (if it supports it) to +/// make the transformation valid. +/// +/// For the above example, a valid downstream plan that uses the extracted +/// equijoin keys should look like: +/// +/// HashJoin +/// - on: `a.id = b.id` (equality) +/// - join_filter: `(a.x > 10) AND (b.x > b.id)` +/// - nulls_equals_null: `true` +/// +/// This reflects that `IS NOT DISTINCT FROM` treats `NULL = NULL` as true and +/// thus requires setting `NullEquality::NullEqualsNull` in the join operator to +/// preserve semantics while enabling an equi-join implementation (e.g., HashJoin). +fn split_is_not_distinct_from_and_other_join_predicate( + filter: Expr, + left_schema: &DFSchema, + right_schema: &DFSchema, +) -> Result<(Vec, Option)> { + split_op_and_other_join_predicates( + filter, + left_schema, + right_schema, + Operator::IsNotDistinctFrom, + ) +} + +/// See comments in `split_eq_and_noneq_join_predicate` for details. +fn split_op_and_other_join_predicates( + filter: Expr, + left_schema: &DFSchema, + right_schema: &DFSchema, + operator: Operator, +) -> Result<(Vec, Option)> { + if !matches!(operator, Operator::Eq | Operator::IsNotDistinctFrom) { + return internal_err!( + "split_op_and_other_join_predicates only supports 'Eq' or 'IsNotDistinctFrom' operators, \ + but received: {:?}", + operator + ); + } + let exprs = split_conjunction_owned(filter); + // Treat 'is not distinct from' comparison as join key in equal joins let mut accum_join_keys: Vec<(Expr, Expr)> = vec![]; let mut accum_filters: Vec = vec![]; for expr in exprs { match expr { Expr::BinaryExpr(BinaryExpr { ref left, - op: Operator::Eq, + ref op, ref right, - }) => { + }) if *op == operator => { let join_key_pair = find_valid_equijoin_key_pair(left, right, left_schema, right_schema)?; diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 49806d6db3..084152d40e 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -134,10 +134,15 @@ impl OptimizerContext { let mut options = ConfigOptions::default(); options.optimizer.filter_null_join_keys = true; + Self::new_with_config_options(Arc::new(options)) + } + + /// Create an optimizer config with the provided [ConfigOptions].
+ pub fn new_with_config_options(options: Arc) -> Self { Self { query_execution_start_time: Utc::now(), alias_generator: Arc::new(AliasGenerator::new()), - options: Arc::new(options), + options, } } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 5a3d57d65a..a8251d6690 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -978,8 +978,11 @@ impl OptimizerRule for PushDownFilter { let group_expr_columns = agg .group_expr .iter() - .map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string()))) - .collect::>>()?; + .map(|e| { + let (relation, name) = e.qualified_name(); + Column::new(relation, name) + }) + .collect::>(); let predicates = split_conjunction_owned(filter.predicate); @@ -1047,7 +1050,10 @@ impl OptimizerRule for PushDownFilter { func.params .partition_by .iter() - .map(|c| Column::from_qualified_name(c.schema_name().to_string())) + .map(|c| { + let (relation, name) = c.qualified_name(); + Column::new(relation, name) + }) .collect::>() }; let potential_partition_keys = window @@ -1567,6 +1573,30 @@ mod tests { ) } + /// verifies that filters with unusual column names are pushed down through aggregate operators + #[test] + fn filter_move_agg_special() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("$a", DataType::UInt32, false), + Field::new("$b", DataType::UInt32, false), + Field::new("$c", DataType::UInt32, false), + ]); + let table_scan = table_scan(Some("test"), &schema, None)?.build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("$a")], vec![sum(col("$b")).alias("total_salary")])? + .filter(col("$a").gt(lit(10i64)))? + .build()?; + // filter of key aggregation is commutative + assert_optimized_plan_equal!( + plan, + @r" + Aggregate: groupBy=[[test.$a]], aggr=[[sum(test.$b) AS total_salary]] + TableScan: test, full_filters=[test.$a > Int64(10)] + " + ) + } + #[test] fn filter_complex_group_by() -> Result<()> { let table_scan = test_table_scan()?; @@ -1647,6 +1677,41 @@ mod tests { ) } + /// verifies that filters with unusual identifier names are pushed down through window functions + #[test] + fn filter_window_special_identifier() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("$a", DataType::UInt32, false), + Field::new("$b", DataType::UInt32, false), + Field::new("$c", DataType::UInt32, false), + ]); + let table_scan = table_scan(Some("test"), &schema, None)?.build()?; + + let window = Expr::from(WindowFunction::new( + WindowFunctionDefinition::WindowUDF( + datafusion_functions_window::rank::rank_udwf(), + ), + vec![], + )) + .partition_by(vec![col("$a"), col("$b")]) + .order_by(vec![col("$c").sort(true, true)]) + .build() + .unwrap(); + + let plan = LogicalPlanBuilder::from(table_scan) + .window(vec![window])? + .filter(col("$b").gt(lit(10i64)))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + WindowAggr: windowExpr=[[rank() PARTITION BY [test.$a, test.$b] ORDER BY [test.$c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] + TableScan: test, full_filters=[test.$b > Int64(10)] + " + ) + } + /// verifies that when partitioning by 'a' and 'b', and filtering by 'a' and 'b', both 'a' and /// 'b' are pushed #[test] diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3c96f953f0..b491a3529f 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1399,6 +1399,41 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // Rules for Case // + // Inline a comparison to a literal with the case statement into the `THEN` clauses. + // which can enable further simplifications + // CASE WHEN X THEN "a" WHEN Y THEN "b" ... END = "a" --> CASE WHEN X THEN "a" = "a" WHEN Y THEN "b" = "a" END + Expr::BinaryExpr(BinaryExpr { + left, + op: op @ (Eq | NotEq), + right, + }) if is_case_with_literal_outputs(&left) && is_lit(&right) => { + let case = into_case(*left)?; + Transformed::yes(Expr::Case(Case { + expr: None, + when_then_expr: case + .when_then_expr + .into_iter() + .map(|(when, then)| { + ( + when, + Box::new(Expr::BinaryExpr(BinaryExpr { + left: then, + op, + right: right.clone(), + })), + ) + }) + .collect(), + else_expr: case.else_expr.map(|els| { + Box::new(Expr::BinaryExpr(BinaryExpr { + left: els, + op, + right, + })) + }), + })) + } + // CASE WHEN true THEN A ... END --> A // CASE WHEN X THEN A WHEN TRUE THEN B ... END --> CASE WHEN X THEN A ELSE B END Expr::Case(Case { @@ -1447,7 +1482,11 @@ impl TreeNodeRewriter for Simplifier<'_, S> { when_then_expr, else_expr, }) if !when_then_expr.is_empty() - && when_then_expr.len() < 3 // The rewrite is O(n²) so limit to small number + // The rewrite is O(n²) in general so limit to small number of when-thens that can be true + && (when_then_expr.len() < 3 // small number of input whens + // or all thens are literal bools and a small number of them are true + || (when_then_expr.iter().all(|(_, then)| is_bool_lit(then)) + && when_then_expr.iter().filter(|(_, then)| is_true(then)).count() < 3)) && info.is_boolean_type(&when_then_expr[0].1)? => { // String disjunction of all the when predicates encountered so far. Not nullable. @@ -1471,6 +1510,55 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // Do a first pass at simplification out_expr.rewrite(self)? } + // CASE + // WHEN X THEN true + // WHEN Y THEN true + // WHEN Z THEN false + // ... + // ELSE true + // END + // + // ---> + // + // NOT(CASE + // WHEN X THEN false + // WHEN Y THEN false + // WHEN Z THEN true + // ... 
+ // ELSE false + // END) + // + // Note: the rationale for this rewrite is that the case can then be further + // simplified into a small number of ANDs and ORs + Expr::Case(Case { + expr: None, + when_then_expr, + else_expr, + }) if !when_then_expr.is_empty() + && when_then_expr + .iter() + .all(|(_, then)| is_bool_lit(then)) // all thens are literal bools + // This simplification is only helpful if we end up with a small number of true thens + && when_then_expr + .iter() + .filter(|(_, then)| is_false(then)) + .count() + < 3 + && else_expr.as_deref().is_none_or(is_bool_lit) => + { + Transformed::yes( + Expr::Case(Case { + expr: None, + when_then_expr: when_then_expr + .into_iter() + .map(|(when, then)| (when, Box::new(Expr::Not(then)))) + .collect(), + else_expr: else_expr + .map(|else_expr| Box::new(Expr::Not(else_expr))), + }) + .not(), + ) + } Expr::ScalarFunction(ScalarFunction { func: udf, args }) => { match udf.simplify(args, info)? { ExprSimplifyResult::Original(args) => { @@ -1843,7 +1931,7 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal let Some(value) = try_cast_literal_to_type(&right_lit_value, &expr_type) else { internal_err!( - "Can't cast the list expr {:?} to type {:?}", + "Can't cast the list expr {:?} to type {}", right_lit_value, &expr_type )? }; @@ -3465,6 +3553,142 @@ mod tests { ); } + #[test] + fn simplify_literal_case_equality() { + // CASE WHEN c2 != false THEN "ok" ELSE "not_ok" + let simple_case = Expr::Case(Case::new( + None, + vec![( + Box::new(col("c2_non_null").not_eq(lit(false))), + Box::new(lit("ok")), + )], + Some(Box::new(lit("not_ok"))), + )); + + // CASE WHEN c2 != false THEN "ok" ELSE "not_ok" == "ok" + // --> + // CASE WHEN c2 != false THEN "ok" == "ok" ELSE "not_ok" == "ok" + // --> + // CASE WHEN c2 != false THEN true ELSE false + // --> + // c2 + assert_eq!( + simplify(binary_expr(simple_case.clone(), Operator::Eq, lit("ok"),)), + col("c2_non_null"), + ); + + // CASE WHEN c2 != false THEN "ok" ELSE "not_ok" != "ok" + // --> + // NOT(CASE WHEN c2 != false THEN "ok" == "ok" ELSE "not_ok" == "ok") + // --> + // NOT(CASE WHEN c2 != false THEN true ELSE false) + // --> + // NOT(c2) + assert_eq!( + simplify(binary_expr(simple_case, Operator::NotEq, lit("ok"),)), + not(col("c2_non_null")), + ); + + let complex_case = Expr::Case(Case::new( + None, + vec![ + ( + Box::new(col("c1").eq(lit("inboxed"))), + Box::new(lit("pending")), + ), + ( + Box::new(col("c1").eq(lit("scheduled"))), + Box::new(lit("pending")), + ), + ( + Box::new(col("c1").eq(lit("completed"))), + Box::new(lit("completed")), + ), + ( + Box::new(col("c1").eq(lit("paused"))), + Box::new(lit("paused")), + ), + (Box::new(col("c2")), Box::new(lit("running"))), + ( + Box::new(col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0)))), + Box::new(lit("backing-off")), + ), + ], + Some(Box::new(lit("ready"))), + )); + + assert_eq!( + simplify(binary_expr( + complex_case.clone(), + Operator::Eq, + lit("completed"), + )), + not_distinct_from(col("c1").eq(lit("completed")), lit(true)).and( + distinct_from(col("c1").eq(lit("inboxed")), lit(true)) + .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true))) + ) + ); + + assert_eq!( + simplify(binary_expr( + complex_case.clone(), + Operator::NotEq, + lit("completed"), + )), + distinct_from(col("c1").eq(lit("completed")), lit(true)) + .or(not_distinct_from(col("c1").eq(lit("inboxed")), lit(true)) + .or(not_distinct_from(col("c1").eq(lit("scheduled")), lit(true)))) + ); + + 
assert_eq!( + simplify(binary_expr( + complex_case.clone(), + Operator::Eq, + lit("running"), + )), + not_distinct_from(col("c2"), lit(true)).and( + distinct_from(col("c1").eq(lit("inboxed")), lit(true)) + .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true))) + .and(distinct_from(col("c1").eq(lit("completed")), lit(true))) + .and(distinct_from(col("c1").eq(lit("paused")), lit(true))) + ) + ); + + assert_eq!( + simplify(binary_expr( + complex_case.clone(), + Operator::Eq, + lit("ready"), + )), + distinct_from(col("c1").eq(lit("inboxed")), lit(true)) + .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true))) + .and(distinct_from(col("c1").eq(lit("completed")), lit(true))) + .and(distinct_from(col("c1").eq(lit("paused")), lit(true))) + .and(distinct_from(col("c2"), lit(true))) + .and(distinct_from( + col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0))), + lit(true) + )) + ); + + assert_eq!( + simplify(binary_expr( + complex_case.clone(), + Operator::NotEq, + lit("ready"), + )), + not_distinct_from(col("c1").eq(lit("inboxed")), lit(true)) + .or(not_distinct_from(col("c1").eq(lit("scheduled")), lit(true))) + .or(not_distinct_from(col("c1").eq(lit("completed")), lit(true))) + .or(not_distinct_from(col("c1").eq(lit("paused")), lit(true))) + .or(not_distinct_from(col("c2"), lit(true))) + .or(not_distinct_from( + col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0))), + lit(true) + )) + ); + } + #[test] fn simplify_expr_case_when_then_else() { // CASE WHEN c2 != false THEN "ok" == "not_ok" ELSE c2 == true diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs index eef45b4c18..5286cbd7bd 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs @@ -90,7 +90,7 @@ pub(super) fn unwrap_cast_in_comparison_for_binary( // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal let Some(value) = try_cast_literal_to_type(&lit_value, &expr_type) else { return internal_err!( - "Can't cast the literal expr {:?} to type {:?}", + "Can't cast the literal expr {:?} to type {}", &lit_value, &expr_type ); diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs index 2f7dadceba..35e256f306 100644 --- a/datafusion/optimizer/src/simplify_expressions/utils.rs +++ b/datafusion/optimizer/src/simplify_expressions/utils.rs @@ -22,7 +22,7 @@ use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::{ expr::{Between, BinaryExpr, InList}, expr_fn::{and, bitwise_and, bitwise_or, or}, - Expr, Like, Operator, + Case, Expr, Like, Operator, }; pub static POWS_OF_TEN: [i128; 38] = [ @@ -265,6 +265,31 @@ pub fn as_bool_lit(expr: &Expr) -> Result> { } } +pub fn is_case_with_literal_outputs(expr: &Expr) -> bool { + match expr { + Expr::Case(Case { + expr: None, + when_then_expr, + else_expr, + }) => { + when_then_expr.iter().all(|(_, then)| is_lit(then)) + && else_expr.as_deref().is_none_or(is_lit) + } + _ => false, + } +} + +pub fn into_case(expr: Expr) -> Result { + match expr { + Expr::Case(case) => Ok(case), + _ => internal_err!("Expected case, got {expr:?}"), + } +} + +pub fn is_lit(expr: &Expr) -> bool { + matches!(expr, Expr::Literal(_, _)) +} + /// negate a Not clause /// input is the clause to be negated.(args of Not clause) /// For BinaryExpr, use the negation of op instead. 
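The new simplifier rule in `expr_simplifier.rs` above inlines a literal comparison into each `THEN`/`ELSE` branch so later passes can collapse the result. The following is a hand-built illustration of the shape of that rewrite, not output captured from the simplifier; it assumes only the `datafusion-expr` crate and uses the same expression builders as the new tests:

```rust
use datafusion_expr::{col, lit, Case, Expr};

fn main() {
    // CASE WHEN x = 1 THEN 'a' WHEN y THEN 'b' ELSE 'c' END = 'a'
    let case = Expr::Case(Case::new(
        None,
        vec![
            (Box::new(col("x").eq(lit(1))), Box::new(lit("a"))),
            (Box::new(col("y")), Box::new(lit("b"))),
        ],
        Some(Box::new(lit("c"))),
    ));
    let before = case.eq(lit("a"));

    // After the rewrite, the comparison lives inside each branch:
    // CASE WHEN x = 1 THEN 'a' = 'a' WHEN y THEN 'b' = 'a' ELSE 'c' = 'a' END
    let after = Expr::Case(Case::new(
        None,
        vec![
            (
                Box::new(col("x").eq(lit(1))),
                Box::new(lit("a").eq(lit("a"))),
            ),
            (Box::new(col("y")), Box::new(lit("b").eq(lit("a")))),
        ],
        Some(Box::new(lit("c").eq(lit("a")))),
    ));

    println!("before: {before}");
    println!("after:  {after}");
}
```

From here, constant folding reduces each branch to a boolean literal, which is what allows the follow-up CASE-to-AND/OR rules above to apply.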
diff --git a/datafusion/physical-expr-adapter/README.md b/datafusion/physical-expr-adapter/README.md index beecd53875..02bc144c16 100644 --- a/datafusion/physical-expr-adapter/README.md +++ b/datafusion/physical-expr-adapter/README.md @@ -1,4 +1,25 @@ -# DataFusion Physical Expression Adapter + + +# Apache DataFusion Physical Expression Adapter + +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate provides utilities for adapting physical expressions to different schemas in DataFusion. @@ -6,3 +27,12 @@ It handles schema differences in file scans by rewriting expressions to match th including type casting, missing columns, and partition values. For detailed documentation, see the [`PhysicalExprAdapter`] trait documentation. + +Most projects should use the [`datafusion`] crate directly, which re-exports +this module. If you are already using the [`datafusion`] crate, there is no +reason to use this crate directly in your project as well. + +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[`datafusion`]: https://crates.io/crates/datafusion +[`physicalexpradapter`]: https://docs.rs/datafusion/latest/datafusion/physical_expr_adapter/trait.PhysicalExprAdapter.html diff --git a/datafusion/physical-expr-common/README.md b/datafusion/physical-expr-common/README.md index fab03fb497..c318e74681 100644 --- a/datafusion/physical-expr-common/README.md +++ b/datafusion/physical-expr-common/README.md @@ -17,16 +17,19 @@ under the License. --> -# DataFusion Core Physical Expressions +# Apache DataFusion Core Physical Expressions -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides shared APIs for implementing -physical expressions such as `PhysicalExpr` and `PhysicalSortExpr`. +physical expressions such as [`PhysicalExpr`] and [`PhysicalSortExpr`]. Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion +[`physicalexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/trait.PhysicalExpr.html +[`physicalsortexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/struct.PhysicalSortExpr.html diff --git a/datafusion/physical-expr/README.md b/datafusion/physical-expr/README.md index b99f3c4946..4c79223b09 100644 --- a/datafusion/physical-expr/README.md +++ b/datafusion/physical-expr/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Physical Expressions +# Apache DataFusion Physical Expressions -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that provides data types and utilities for physical expressions. 
@@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index abc963355b..59fed88b14 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -160,7 +160,7 @@ fn boolean_op( left: &dyn Array, right: &dyn Array, op: impl FnOnce(&BooleanArray, &BooleanArray) -> Result, -) -> Result, ArrowError> { +) -> Result, ArrowError> { let ll = as_boolean_array(left).expect("boolean_op failed to downcast left array"); let rr = as_boolean_array(right).expect("boolean_op failed to downcast right array"); op(ll, rr).map(|t| Arc::new(t) as _) @@ -179,7 +179,7 @@ macro_rules! binary_string_array_flag_op { compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) }, other => internal_err!( - "Data type {:?} not supported for binary_string_array_flag_op operation '{}' on string array", + "Data type {} not supported for binary_string_array_flag_op operation '{}' on string array", other, stringify!($OP) ), } @@ -258,7 +258,7 @@ macro_rules! binary_string_array_flag_op_scalar { DataType::Utf8View => compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT, $FLAG), DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG), other => internal_err!( - "Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array", + "Data type {} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array", other, stringify!($OP) ), }.map( @@ -273,7 +273,7 @@ macro_rules! binary_string_array_flag_op_scalar { ) }, other => internal_err!( - "Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array", + "Data type {} not supported for binary_string_array_flag_op_scalar operation '{}' on string array", other, stringify!($OP) ), }; @@ -731,7 +731,7 @@ fn to_result_type_array( Ok(cast(&array, result_type)?) } else { internal_err!( - "Incompatible Dictionary value type {value_type:?} with result type {result_type:?} of Binary operator {op:?}" + "Incompatible Dictionary value type {value_type} with result type {result_type} of Binary operator {op:?}" ) } } diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index ae26f3e842..d2553146cb 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -71,7 +71,7 @@ macro_rules! create_dyn_kernel { call_bitwise_kernel!(left, right, $KERNEL, UInt64Array) } other => plan_err!( - "Data type {:?} not supported for binary operation '{}' on dyn arrays", + "Data type {} not supported for binary operation '{}' on dyn arrays", other, stringify!($KERNEL) ), @@ -117,7 +117,7 @@ macro_rules! 
create_dyn_scalar_kernel { DataType::UInt32 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt32Array, u32), DataType::UInt64 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt64Array, u64), other => plan_err!( - "Data type {:?} not supported for binary operation '{}' on dyn arrays", + "Data type {} not supported for binary operation '{}' on dyn arrays", other, stringify!($KERNEL) ), diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index c91678317b..c1fd201e10 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -232,7 +232,7 @@ pub fn cast_with_options( } else if can_cast_types(&expr_type, &cast_type) { Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) } else { - not_impl_err!("Unsupported CAST from {expr_type:?} to {cast_type:?}") + not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}") } } diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index eeb0c6e802..a53b32c976 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -98,8 +98,7 @@ impl Eq for DynamicFilterPhysicalExpr {} impl Display for DynamicFilterPhysicalExpr { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let inner = self.current().expect("Failed to get current expression"); - write!(f, "DynamicFilterPhysicalExpr [ {inner} ]") + self.render(f, |expr, f| write!(f, "{expr}")) } } @@ -173,6 +172,11 @@ impl DynamicFilterPhysicalExpr { } } + /// Get the current generation of the expression. + fn current_generation(&self) -> u64 { + self.inner.read().generation + } + /// Get the current expression. /// This will return the current expression with any children /// remapped to match calls to [`PhysicalExpr::with_new_children`]. @@ -206,6 +210,26 @@ impl DynamicFilterPhysicalExpr { }; Ok(()) } + + fn render( + &self, + f: &mut std::fmt::Formatter<'_>, + render_expr: impl FnOnce( + Arc, + &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result, + ) -> std::fmt::Result { + let inner = self.current().map_err(|_| std::fmt::Error)?; + let current_generation = self.current_generation(); + write!(f, "DynamicFilter [ ")?; + if current_generation == 1 { + write!(f, "empty")?; + } else { + render_expr(inner, f)?; + } + + write!(f, " ]") + } } impl PhysicalExpr for DynamicFilterPhysicalExpr { @@ -295,8 +319,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let inner = self.current().map_err(|_| std::fmt::Error)?; - inner.fmt_sql(f) + self.render(f, |expr, f| expr.fmt_sql(f)) } fn snapshot(&self) -> Result>> { @@ -314,7 +337,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { mod test { use crate::{ expressions::{col, lit, BinaryExpr}, - utils::reassign_predicate_columns, + utils::reassign_expr_columns, }; use arrow::{ array::RecordBatch, @@ -352,18 +375,16 @@ mod test { ])); // Each ParquetExec calls `with_new_children` on the DynamicFilterPhysicalExpr // and remaps the children to the file schema. 
- let dynamic_filter_1 = reassign_predicate_columns( + let dynamic_filter_1 = reassign_expr_columns( Arc::clone(&dynamic_filter) as Arc, &filter_schema_1, - false, ) .unwrap(); let snap = dynamic_filter_1.snapshot().unwrap().unwrap(); insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#); - let dynamic_filter_2 = reassign_predicate_columns( + let dynamic_filter_2 = reassign_expr_columns( Arc::clone(&dynamic_filter) as Arc, &filter_schema_2, - false, ) .unwrap(); let snap = dynamic_filter_2.snapshot().unwrap().unwrap(); diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index b6fe84ea51..597ebbdd12 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -470,6 +470,7 @@ mod tests { use datafusion_common::plan_err; use datafusion_expr::type_coercion::binary::comparison_coercion; use datafusion_physical_expr_common::physical_expr::fmt_sql; + use itertools::Itertools as _; type InListCastResult = (Arc, Vec>); @@ -488,7 +489,8 @@ mod tests { let result_type = get_coerce_type(expr_type, &list_types); match result_type { None => plan_err!( - "Can not find compatible types to compare {expr_type:?} with {list_types:?}" + "Can not find compatible types to compare {expr_type} with [{}]", + list_types.iter().join(", ") ), Some(data_type) => { // find the coerced type diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index c5a58d5c6d..b32aabbe5b 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -142,7 +142,7 @@ pub fn try_cast( } else if can_cast_types(&expr_type, &cast_type) { Ok(Arc::new(TryCastExpr::new(expr, cast_type))) } else { - not_impl_err!("Unsupported TRY_CAST from {expr_type:?} to {cast_type:?}") + not_impl_err!("Unsupported TRY_CAST from {expr_type} to {cast_type}") } } diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index fb491341f8..73df60c42e 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -111,7 +111,7 @@ pub fn create_physical_expr( input_dfschema: &DFSchema, execution_props: &ExecutionProps, ) -> Result> { - let input_schema: &Schema = &input_dfschema.into(); + let input_schema = input_dfschema.as_arrow(); match e { Expr::Alias(Alias { expr, metadata, .. }) => { @@ -407,6 +407,7 @@ where /// Convert a logical expression to a physical expression (without any simplification, etc) pub fn logical2physical(expr: &Expr, schema: &Schema) -> Arc { + // TODO this makes a deep copy of the Schema. 
Should take SchemaRef instead and avoid deep copy let df_schema = schema.clone().to_dfschema().unwrap(); let execution_props = ExecutionProps::new(); create_physical_expr(expr, &df_schema, &execution_props).unwrap() diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs index ea07fc7149..7536def353 100644 --- a/datafusion/physical-expr/src/utils/mod.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -238,22 +238,23 @@ pub fn collect_columns(expr: &Arc) -> HashSet { columns } -/// Re-assign column indices referenced in predicate according to given schema. -/// This may be helpful when dealing with projections. -pub fn reassign_predicate_columns( - pred: Arc, +/// Re-assign indices of [`Column`]s within the given [`PhysicalExpr`] according to +/// the provided [`Schema`]. +/// +/// This can be useful when attempting to map an expression onto a different schema. +/// +/// # Errors +/// +/// This function will return an error if any column in the expression cannot be found +/// in the provided schema. +pub fn reassign_expr_columns( + expr: Arc, schema: &Schema, - ignore_not_found: bool, ) -> Result> { - pred.transform_down(|expr| { - let expr_any = expr.as_any(); - - if let Some(column) = expr_any.downcast_ref::() { - let index = match schema.index_of(column.name()) { - Ok(idx) => idx, - Err(_) if ignore_not_found => usize::MAX, - Err(e) => return Err(e.into()), - }; + expr.transform_down(|expr| { + if let Some(column) = expr.as_any().downcast_ref::() { + let index = schema.index_of(column.name())?; + return Ok(Transformed::yes(Arc::new(Column::new( column.name(), index, @@ -506,7 +507,7 @@ pub(crate) mod tests { } #[test] - fn test_reassign_predicate_columns_in_list() { + fn test_reassign_expr_columns_in_list() { let int_field = Field::new("should_not_matter", DataType::Int64, true); let dict_field = Field::new( "id", @@ -526,7 +527,7 @@ pub(crate) mod tests { ) .unwrap(); - let actual = reassign_predicate_columns(pred, &schema_small, false).unwrap(); + let actual = reassign_expr_columns(pred, &schema_small).unwrap(); let expected = in_list( Arc::new(Column::new_with_schema("id", &schema_small).unwrap()), diff --git a/datafusion/physical-expr/src/window/standard.rs b/datafusion/physical-expr/src/window/standard.rs index 7b208ea41f..e9e7f6abf6 100644 --- a/datafusion/physical-expr/src/window/standard.rs +++ b/datafusion/physical-expr/src/window/standard.rs @@ -161,6 +161,9 @@ impl WindowExpr for StandardWindowExpr { let field = self.expr.field()?; let out_type = field.data_type(); let sort_options = self.order_by.iter().map(|o| o.options).collect::>(); + // create a WindowAggState to clone when `window_agg_state` does not contain the respective + // group, which is faster than potentially creating a new one at every iteration + let new_state = WindowAggState::new(out_type)?; for (partition_row, partition_batch_state) in partition_batches.iter() { let window_state = if let Some(window_state) = window_agg_state.get_mut(partition_row) { @@ -170,7 +173,7 @@ impl WindowExpr for StandardWindowExpr { window_agg_state .entry(partition_row.clone()) .or_insert(WindowState { - state: WindowAggState::new(out_type)?, + state: new_state.clone(), window_fn: WindowFn::Builtin(evaluator), }) }; @@ -235,6 +238,9 @@ impl WindowExpr for StandardWindowExpr { } let out_col = if row_wise_results.is_empty() { new_empty_array(out_type) + } else if row_wise_results.len() == 1 { + // fast path when the result only has a single row + row_wise_results[0].to_array()? 
} else { ScalarValue::iter_to_array(row_wise_results.into_iter())? }; diff --git a/datafusion/physical-optimizer/README.md b/datafusion/physical-optimizer/README.md index 374351b802..3efbc19d2e 100644 --- a/datafusion/physical-optimizer/README.md +++ b/datafusion/physical-optimizer/README.md @@ -17,10 +17,9 @@ under the License. --> -# DataFusion Physical Optimizer +# Apache DataFusion Physical Optimizer -DataFusion is an extensible query execution framework, written in Rust, -that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate contains the physical optimizer for DataFusion. @@ -28,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 2e56e2cdb3..a796cf2cc4 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -34,12 +34,12 @@ pub mod ensure_coop; pub mod filter_pushdown; pub mod join_selection; pub mod limit_pushdown; +pub mod limit_pushdown_past_window; pub mod limited_distinct_aggregation; pub mod optimizer; pub mod output_requirements; pub mod projection_pushdown; pub use datafusion_pruning as pruning; -mod limit_pushdown_past_window; pub mod sanity_checker; pub mod topk_aggregation; pub mod update_aggr_exprs; diff --git a/datafusion/physical-plan/README.md b/datafusion/physical-plan/README.md index 37cc165801..3a33100f2f 100644 --- a/datafusion/physical-plan/README.md +++ b/datafusion/physical-plan/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Physical Plan +# Apache DataFusion Physical Plan -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate is a submodule of DataFusion that contains the `ExecutionPlan` trait and the various implementations of that trait for built in operators such as filters, projections, joins, aggregations, etc. @@ -28,5 +28,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/physical-plan/benches/spill_io.rs b/datafusion/physical-plan/benches/spill_io.rs index e42c8073ae..40c8f7634c 100644 --- a/datafusion/physical-plan/benches/spill_io.rs +++ b/datafusion/physical-plan/benches/spill_io.rs @@ -115,8 +115,9 @@ fn bench_spill_io(c: &mut Criterion) { // - Wait for the consumer to finish processing |spill_file| { rt.block_on(async { - let stream = - spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }) }, @@ -519,8 +520,9 @@ fn benchmark_spill_batches_for_all_codec( ) .unwrap() .unwrap(); - let stream = - spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }) }, @@ -553,7 +555,9 @@ fn benchmark_spill_batches_for_all_codec( let rt = Runtime::new().unwrap(); let start = Instant::now(); rt.block_on(async { - let stream = spill_manager.read_spill_as_stream(spill_file).unwrap(); + let stream = spill_manager + .read_spill_as_stream(spill_file, None) + .unwrap(); let _ = collect(stream).await.unwrap(); }); let read_time = start.elapsed(); diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 1cad0ee85c..e386f79e42 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -935,7 +935,7 @@ impl TreeRenderVisitor<'_, '_> { } else { let total_spaces = max_render_width - render_width; let half_spaces = total_spaces / 2; - let extra_left_space = if total_spaces % 2 == 0 { 0 } else { 1 }; + let extra_left_space = if total_spaces.is_multiple_of(2) { 0 } else { 1 }; format!( "{}{}{}", " ".repeat(half_spaces + extra_left_space), diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 0a811a8826..047c72076e 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -265,7 +265,7 @@ impl FilterExec { default_selectivity, )?; let mut eq_properties = input.equivalence_properties().clone(); - let (equal_pairs, _) = collect_columns_from_predicate(predicate); + let (equal_pairs, _) = collect_columns_from_predicate_inner(predicate); for (lhs, rhs) in equal_pairs { eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))? 
} @@ -716,8 +716,18 @@ impl RecordBatchStream for FilterExecStream { } /// Return the equals Column-Pairs and Non-equals Column-Pairs +#[deprecated( + since = "51.0.0", + note = "This function will be internal in the future" +)] pub fn collect_columns_from_predicate( predicate: &'_ Arc, +) -> EqualAndNonEqual<'_> { + collect_columns_from_predicate_inner(predicate) +} + +fn collect_columns_from_predicate_inner( + predicate: &'_ Arc, ) -> EqualAndNonEqual<'_> { let mut eq_predicate_columns = Vec::::new(); let mut ne_predicate_columns = Vec::::new(); @@ -787,7 +797,7 @@ mod tests { &schema, )?; - let (equal_pairs, ne_pairs) = collect_columns_from_predicate(&predicate); + let (equal_pairs, ne_pairs) = collect_columns_from_predicate_inner(&predicate); assert_eq!(2, equal_pairs.len()); assert!(equal_pairs[0].0.eq(&col("c2", &schema)?)); assert!(equal_pairs[0].1.eq(&lit(4u32))); diff --git a/datafusion/physical-plan/src/filter_pushdown.rs b/datafusion/physical-plan/src/filter_pushdown.rs index 1f9b6ce7fb..f6b1b7448f 100644 --- a/datafusion/physical-plan/src/filter_pushdown.rs +++ b/datafusion/physical-plan/src/filter_pushdown.rs @@ -38,7 +38,7 @@ use std::collections::HashSet; use std::sync::Arc; use datafusion_common::Result; -use datafusion_physical_expr::utils::{collect_columns, reassign_predicate_columns}; +use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use itertools::Itertools; @@ -343,7 +343,7 @@ impl ChildFilterDescription { // All columns exist in child - we can push down // Need to reassign column indices to match child schema let reassigned_filter = - reassign_predicate_columns(Arc::clone(filter), &child_schema, false)?; + reassign_expr_columns(Arc::clone(filter), &child_schema)?; child_parent_filters .push(PushedDownPredicate::supported(reassigned_filter)); } else { diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index c8ed196039..728497444c 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -725,6 +725,12 @@ impl DisplayAs for HashJoinExec { } else { "".to_string() }; + let display_null_equality = + if matches!(self.null_equality(), NullEquality::NullEqualsNull) { + ", NullsEqual: true" + } else { + "" + }; let on = self .on .iter() @@ -733,8 +739,13 @@ impl DisplayAs for HashJoinExec { .join(", "); write!( f, - "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}", - self.mode, self.join_type, on, display_filter, display_projections, + "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}{}", + self.mode, + self.join_type, + on, + display_filter, + display_projections, + display_null_equality, ) } DisplayFormatType::TreeRender => { @@ -753,6 +764,10 @@ impl DisplayAs for HashJoinExec { writeln!(f, "on={on}")?; + if matches!(self.null_equality(), NullEquality::NullEqualsNull) { + writeln!(f, "NullsEqual: true")?; + } + if let Some(filter) = self.filter.as_ref() { writeln!(f, "filter={filter}")?; } diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index ced9078d95..00d1613090 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -173,7 +173,8 @@ pub struct NestedLoopJoinExec { pub(crate) filter: Option, /// How the join is performed pub(crate) join_type: JoinType, - /// The schema once the 
join is applied + /// The full concatenated schema of left and right children should be distinct from + /// the output schema of the operator join_schema: SchemaRef, /// Future that consumes left input and buffers it in memory /// @@ -550,7 +551,7 @@ impl ExecutionPlan for NestedLoopJoinExec { self.right.partition_statistics(None)?, vec![], &self.join_type, - &self.join_schema, + &self.schema(), ) } @@ -2268,6 +2269,26 @@ pub(crate) mod tests { Ok(()) } + #[tokio::test] + async fn join_has_correct_stats() -> Result<()> { + let left = build_left_table(); + let right = build_right_table(); + let nested_loop_join = NestedLoopJoinExec::try_new( + left, + right, + None, + &JoinType::Left, + Some(vec![1, 2]), + )?; + let stats = nested_loop_join.partition_statistics(None)?; + assert_eq!( + nested_loop_join.schema().fields().len(), + stats.column_statistics.len(), + ); + assert_eq!(2, stats.column_statistics.len()); + Ok(()) + } + #[rstest] #[tokio::test] async fn join_right_semi_with_filter( diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs index 3ee8bf5260..592878a3bb 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs @@ -351,15 +351,22 @@ impl DisplayAs for SortMergeJoinExec { .map(|(c1, c2)| format!("({c1}, {c2})")) .collect::>() .join(", "); + let display_null_equality = + if matches!(self.null_equality(), NullEquality::NullEqualsNull) { + ", NullsEqual: true" + } else { + "" + }; write!( f, - "SortMergeJoin: join_type={:?}, on=[{}]{}", + "SortMergeJoin: join_type={:?}, on=[{}]{}{}", self.join_type, on, - self.filter.as_ref().map_or("".to_string(), |f| format!( - ", filter={}", - f.expression() - )) + self.filter.as_ref().map_or_else( + || "".to_string(), + |f| format!(", filter={}", f.expression()) + ), + display_null_equality, ) } DisplayFormatType::TreeRender => { @@ -375,7 +382,13 @@ impl DisplayAs for SortMergeJoinExec { if self.join_type() != JoinType::Inner { writeln!(f, "join_type={:?}", self.join_type)?; } - writeln!(f, "on={on}") + writeln!(f, "on={on}")?; + + if matches!(self.null_equality(), NullEquality::NullEqualsNull) { + writeln!(f, "NullsEqual: true")?; + } + + Ok(()) } } } diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index d392650f88..a62ae79635 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -563,15 +563,6 @@ fn estimate_inner_join_cardinality( .iter() .zip(right_stats.column_statistics.iter()) { - // Break if any of statistics bounds are undefined - if left_stat.min_value.get_value().is_none() - || left_stat.max_value.get_value().is_none() - || right_stat.min_value.get_value().is_none() - || right_stat.max_value.get_value().is_none() - { - return None; - } - let left_max_distinct = max_distinct_count(&left_stats.num_rows, left_stat); let right_max_distinct = max_distinct_count(&right_stats.num_rows, right_stat); let max_distinct = left_max_distinct.max(&right_max_distinct); @@ -658,7 +649,8 @@ fn estimate_disjoint_inputs( /// Estimate the number of maximum distinct values that can be present in the /// given column from its statistics. If distinct_count is available, uses it /// directly. Otherwise, if the column is numeric and has min/max values, it -/// estimates the maximum distinct count from those. +/// estimates the maximum distinct count from those. 
Otherwise, the num_rows +/// is used. fn max_distinct_count( num_rows: &Precision, stats: &ColumnStatistics, @@ -2014,12 +2006,18 @@ mod tests { (20, Inexact(1), Inexact(40), Absent, Absent), Some(Inexact(10)), ), - // When we have distinct count. + // Distinct count matches the range ( (10, Inexact(1), Inexact(10), Inexact(10), Absent), (10, Inexact(1), Inexact(10), Inexact(10), Absent), Some(Inexact(10)), ), + // Distinct count takes precedence over the range + ( + (10, Inexact(1), Inexact(3), Inexact(10), Absent), + (10, Inexact(1), Inexact(3), Inexact(10), Absent), + Some(Inexact(10)), + ), // distinct(left) > distinct(right) ( (10, Inexact(1), Inexact(10), Inexact(5), Absent), @@ -2063,32 +2061,33 @@ mod tests { // Edge cases // ========== // - // No column level stats. + // No column level stats, fall back to row count. ( (10, Absent, Absent, Absent, Absent), (10, Absent, Absent, Absent, Absent), - None, + Some(Inexact(10)), ), - // No min or max (or both). + // No min or max (or both), but distinct available. ( (10, Absent, Absent, Inexact(3), Absent), (10, Absent, Absent, Inexact(3), Absent), - None, + Some(Inexact(33)), ), ( (10, Inexact(2), Absent, Inexact(3), Absent), (10, Absent, Inexact(5), Inexact(3), Absent), - None, + Some(Inexact(33)), ), ( (10, Absent, Inexact(3), Inexact(3), Absent), (10, Inexact(1), Absent, Inexact(3), Absent), - None, + Some(Inexact(33)), ), + // No min or max, fall back to row count ( (10, Absent, Inexact(3), Absent, Absent), (10, Inexact(1), Absent, Absent, Absent), - None, + Some(Inexact(10)), ), // Non overlapping min/max (when exact=False). ( diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 4611c547f2..6a0cae20e5 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -105,7 +105,8 @@ impl DisplayAs for GlobalLimitExec { f, "GlobalLimitExec: skip={}, fetch={}", self.skip, - self.fetch.map_or("None".to_string(), |x| x.to_string()) + self.fetch + .map_or_else(|| "None".to_string(), |x| x.to_string()) ) } DisplayFormatType::TreeRender => { diff --git a/datafusion/physical-plan/src/sorts/multi_level_merge.rs b/datafusion/physical-plan/src/sorts/multi_level_merge.rs index bb6fc751b8..58d046cc90 100644 --- a/datafusion/physical-plan/src/sorts/multi_level_merge.rs +++ b/datafusion/physical-plan/src/sorts/multi_level_merge.rs @@ -237,7 +237,8 @@ impl MultiLevelMergeBuilder { let spill_file = self.sorted_spill_files.remove(0); // Not reserving any memory for this disk as we are not holding it in memory - self.spill_manager.read_spill_as_stream(spill_file.file) + self.spill_manager + .read_spill_as_stream(spill_file.file, None) } // Only in memory streams, so merge them all in a single pass @@ -274,10 +275,12 @@ impl MultiLevelMergeBuilder { .spill_manager .clone() .with_batch_read_buffer_capacity(buffer_size) - .read_spill_as_stream(spill.file)?; + .read_spill_as_stream( + spill.file, + Some(spill.max_record_batch_memory), + )?; sorted_streams.push(stream); } - let merge_sort_stream = self.create_new_merge_sort( sorted_streams, // If we have no sorted spill files left, this is the last run diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index fab62bff84..5b9a91e781 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -43,6 +43,7 @@ use datafusion_common_runtime::SpawnedTask; use datafusion_execution::disk_manager::RefCountedTempFile; use 
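For orientation on the updated expectations in the join-cardinality tests above, assuming the classic inner-join estimate of left_rows × right_rows / max_distinct applied by estimate_inner_join_cardinality: with 10 rows on each side and an inexact distinct count of 3, the estimate is 10 × 10 / 3 ≈ 33, hence Inexact(33); when neither a distinct count nor min/max bounds are available, max_distinct_count now falls back to the row count, so the same inputs give 10 × 10 / 10 = 10, hence Inexact(10). The exact figure depends on the integer arithmetic already in this module; the numbers here are only meant to motivate the new expected values.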
datafusion_execution::RecordBatchStream; use futures::{FutureExt as _, Stream}; +use log::warn; /// Stream that reads spill files from disk where each batch is read in a spawned blocking task /// It will read one batch at a time and will not do any buffering, to buffer data use [`crate::common::spawn_buffered`] @@ -54,8 +55,16 @@ use futures::{FutureExt as _, Stream}; struct SpillReaderStream { schema: SchemaRef, state: SpillReaderStreamState, + /// Maximum memory size observed among spilling sorted record batches. + /// This is used for validation purposes during reading each RecordBatch from spill. + /// For context on why this value is recorded and validated, + /// see `physical_plan/sort/multi_level_merge.rs`. + max_record_batch_memory: Option, } +// Small margin allowed to accommodate slight memory accounting variation +const SPILL_BATCH_MEMORY_MARGIN: usize = 4096; + /// When we poll for the next batch, we will get back both the batch and the reader, /// so we can call `next` again. type NextRecordBatchResult = Result<(StreamReader>, Option)>; @@ -76,10 +85,15 @@ enum SpillReaderStreamState { } impl SpillReaderStream { - fn new(schema: SchemaRef, spill_file: RefCountedTempFile) -> Self { + fn new( + schema: SchemaRef, + spill_file: RefCountedTempFile, + max_record_batch_memory: Option, + ) -> Self { Self { schema, state: SpillReaderStreamState::Uninitialized(spill_file), + max_record_batch_memory, } } @@ -125,6 +139,23 @@ impl SpillReaderStream { Ok((reader, batch)) => { match batch { Some(batch) => { + if let Some(max_record_batch_memory) = + self.max_record_batch_memory + { + let actual_size = + get_record_batch_memory_size(&batch); + if actual_size + > max_record_batch_memory + + SPILL_BATCH_MEMORY_MARGIN + { + warn!( + "Record batch memory usage ({actual_size} bytes) exceeds the expected limit ({max_record_batch_memory} bytes) \n\ + by more than the allowed tolerance ({SPILL_BATCH_MEMORY_MARGIN} bytes).\n\ + This likely indicates a bug in memory accounting during spilling.\n\ + Please report this issue in https://github.com/apache/datafusion/issues/17340." 
+ ); + } + } self.state = SpillReaderStreamState::Waiting(reader); Poll::Ready(Some(Ok(batch))) @@ -417,7 +448,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -481,7 +512,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), dict_schema); let batches = collect(stream).await?; assert_eq!(batches.len(), 2); @@ -512,7 +543,7 @@ mod tests { assert!(spill_file.path().exists()); assert!(max_batch_mem > 0); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -547,7 +578,7 @@ mod tests { let spilled_rows = spill_manager.metrics.spilled_rows.value(); assert_eq!(spilled_rows, num_rows); - let stream = spill_manager.read_spill_as_stream(spill_file)?; + let stream = spill_manager.read_spill_as_stream(spill_file, None)?; assert_eq!(stream.schema(), schema); let batches = collect(stream).await?; @@ -752,7 +783,7 @@ mod tests { .unwrap(); let size = get_record_batch_memory_size(&batch); - assert_eq!(size, 8320); + assert_eq!(size, 8208); } // ==== Spill manager tests ==== @@ -931,8 +962,10 @@ mod tests { .spill_record_batch_and_finish(&batches, "Test2")? .unwrap(); - let mut stream_1 = spill_manager.read_spill_as_stream(spill_file_1)?; - let mut stream_2 = spill_manager.read_spill_as_stream(spill_file_2)?; + let mut stream_1 = + spill_manager.read_spill_as_stream(spill_file_1, None)?; + let mut stream_2 = + spill_manager.read_spill_as_stream(spill_file_2, None)?; stream_1.next().await; stream_2.next().await; diff --git a/datafusion/physical-plan/src/spill/spill_manager.rs b/datafusion/physical-plan/src/spill/spill_manager.rs index ad23bd66a0..cc39102d89 100644 --- a/datafusion/physical-plan/src/spill/spill_manager.rs +++ b/datafusion/physical-plan/src/spill/spill_manager.rs @@ -174,10 +174,12 @@ impl SpillManager { pub fn read_spill_as_stream( &self, spill_file_path: RefCountedTempFile, + max_record_batch_memory: Option, ) -> Result { let stream = Box::pin(cooperative(SpillReaderStream::new( Arc::clone(&self.schema), spill_file_path, + max_record_batch_memory, ))); Ok(spawn_buffered(stream, self.batch_read_buffer_capacity)) diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index e51b2d3f56..8b6a40625b 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -429,16 +429,25 @@ trait PartitionSearcher: Send { let partition_batches = self.evaluate_partition_batches(&record_batch, window_expr)?; for (partition_row, partition_batch) in partition_batches { - let partition_batch_state = partition_buffers - .entry(partition_row) + if let Some(partition_batch_state) = partition_buffers.get_mut(&partition_row) + { + partition_batch_state.extend(&partition_batch)? 
+ } else { + let options = RecordBatchOptions::new() + .with_row_count(Some(partition_batch.num_rows())); // Use input_schema for the buffer schema, not `record_batch.schema()` // as it may not have the "correct" schema in terms of output // nullability constraints. For details, see the following issue: // https://github.com/apache/datafusion/issues/9320 - .or_insert_with(|| { - PartitionBatchState::new(Arc::clone(self.input_schema())) - }); - partition_batch_state.extend(&partition_batch)?; + let partition_batch = RecordBatch::try_new_with_options( + Arc::clone(self.input_schema()), + partition_batch.columns().to_vec(), + &options, + )?; + let partition_batch_state = + PartitionBatchState::new_with_batch(partition_batch); + partition_buffers.insert(partition_row, partition_batch_state); + } } if self.is_mode_linear() { @@ -870,9 +879,11 @@ impl SortedSearch { cur_window_expr_out_result_len }); argmin(out_col_counts).map_or(0, |(min_idx, minima)| { - for (row, count) in counts.swap_remove(min_idx).into_iter() { - let partition_batch = &mut partition_buffers[row]; - partition_batch.n_out_row = count; + let mut slowest_partition = counts.swap_remove(min_idx); + for (partition_key, partition_batch) in partition_buffers.iter_mut() { + if let Some(count) = slowest_partition.remove(partition_key) { + partition_batch.n_out_row = count; + } } minima }) @@ -1176,6 +1187,7 @@ fn get_aggregate_result_out_column( ) -> Result { let mut result = None; let mut running_length = 0; + let mut batches_to_concat = vec![]; // We assume that iteration order is according to insertion order for ( _, @@ -1187,16 +1199,25 @@ fn get_aggregate_result_out_column( { if running_length < len_to_show { let n_to_use = min(len_to_show - running_length, out_col.len()); - let slice_to_use = out_col.slice(0, n_to_use); - result = Some(match result { - Some(arr) => concat(&[&arr, &slice_to_use])?, - None => slice_to_use, - }); + let slice_to_use = if n_to_use == out_col.len() { + // avoid slice when the entire column is used + Arc::clone(out_col) + } else { + out_col.slice(0, n_to_use) + }; + batches_to_concat.push(slice_to_use); running_length += n_to_use; } else { break; } } + + if !batches_to_concat.is_empty() { + let array_refs: Vec<&dyn Array> = + batches_to_concat.iter().map(|a| a.as_ref()).collect(); + result = Some(concat(&array_refs)?); + } + if running_length != len_to_show { return exec_err!( "Generated row number should be {len_to_show}, it is {running_length}" @@ -1375,7 +1396,7 @@ mod tests { &partitionby_exprs, &orderby_exprs, Arc::new(window_frame), - &input.schema(), + input.schema(), false, false, None, diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index dccd9200fc..829cadf50a 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -96,7 +96,7 @@ pub fn create_window_expr( partition_by: &[Arc], order_by: &[PhysicalSortExpr], window_frame: Arc, - input_schema: &Schema, + input_schema: SchemaRef, ignore_nulls: bool, distinct: bool, filter: Option>, @@ -105,7 +105,7 @@ pub fn create_window_expr( WindowFunctionDefinition::AggregateUDF(fun) => { let aggregate = if distinct { AggregateExprBuilder::new(Arc::clone(fun), args.to_vec()) - .schema(Arc::new(input_schema.clone())) + .schema(input_schema) .alias(name) .with_ignore_nulls(ignore_nulls) .distinct() @@ -113,7 +113,7 @@ pub fn create_window_expr( .map(Arc::new)? 
} else { AggregateExprBuilder::new(Arc::clone(fun), args.to_vec()) - .schema(Arc::new(input_schema.clone())) + .schema(input_schema) .alias(name) .with_ignore_nulls(ignore_nulls) .build() @@ -128,7 +128,7 @@ pub fn create_window_expr( ) } WindowFunctionDefinition::WindowUDF(fun) => Arc::new(StandardWindowExpr::new( - create_udwf_window_expr(fun, args, input_schema, name, ignore_nulls)?, + create_udwf_window_expr(fun, args, &input_schema, name, ignore_nulls)?, partition_by, order_by, window_frame, @@ -371,17 +371,40 @@ pub(crate) fn window_equivalence_properties( for (i, expr) in window_exprs.iter().enumerate() { let partitioning_exprs = expr.partition_by(); let no_partitioning = partitioning_exprs.is_empty(); - // Collect columns defining partitioning, and construct all `SortOptions` - // variations for them. Then, we will check each one whether it satisfies - // the existing ordering provided by the input plan. + + // Find "one" valid ordering for partition columns to avoid exponential complexity. + // see https://github.com/apache/datafusion/issues/17401 let mut all_satisfied_lexs = vec![]; - for lex in partitioning_exprs - .iter() - .map(|pb_order| sort_options_resolving_constant(Arc::clone(pb_order))) - .multi_cartesian_product() - .filter_map(LexOrdering::new) - { - if window_eq_properties.ordering_satisfy(lex.clone())? { + let mut candidate_ordering = vec![]; + + for partition_expr in partitioning_exprs.iter() { + let sort_options = + sort_options_resolving_constant(Arc::clone(partition_expr), true); + + // Try each sort option and pick the first one that works + let mut found = false; + for sort_expr in sort_options.into_iter() { + candidate_ordering.push(sort_expr); + if let Some(lex) = LexOrdering::new(candidate_ordering.clone()) { + if window_eq_properties.ordering_satisfy(lex)? { + found = true; + break; + } + } + // This option didn't work, remove it and try the next one + candidate_ordering.pop(); + } + // If no sort option works for this column, we can't build a valid ordering + if !found { + candidate_ordering.clear(); + break; + } + } + + // If we successfully built an ordering for all columns, use it + // When there are no partition expressions, candidate_ordering will be empty and won't be added + if candidate_ordering.len() == partitioning_exprs.len() { + if let Some(lex) = LexOrdering::new(candidate_ordering) { all_satisfied_lexs.push(lex); } } @@ -410,8 +433,10 @@ pub(crate) fn window_equivalence_properties( // Window function results in a partial constant value in // some ordering. Adjust the ordering equivalences accordingly: let new_lexs = all_satisfied_lexs.into_iter().flat_map(|lex| { - let new_partial_consts = - sort_options_resolving_constant(Arc::clone(&window_col)); + let new_partial_consts = sort_options_resolving_constant( + Arc::clone(&window_col), + false, + ); new_partial_consts.into_iter().map(move |partial| { let mut existing = lex.clone(); @@ -467,23 +492,52 @@ pub(crate) fn window_equivalence_properties( // utilize set-monotonicity since the set shrinks as the frame // boundary starts "touching" the end of the table. 
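To put the exponential-complexity concern from apache/datafusion#17401 (referenced in the comment above) in concrete terms: the removed code built the multi_cartesian_product of the two sort-option variants the old sort_options_resolving_constant produced per expression, so N partition columns could yield up to 2^N candidate lex orderings, each verified with ordering_satisfy. The greedy search that replaces it tries at most a handful of variants for one column at a time and keeps the first that still satisfies the input ordering, so the number of ordering_satisfy calls now grows linearly with the number of columns; the same idea is applied to the causal aggregate-argument case just below.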
else if frame.is_causal() { - let args_all_lexs = sliding_expr - .get_aggregate_expr() - .expressions() - .into_iter() - .map(sort_options_resolving_constant) - .multi_cartesian_product(); - - let (mut asc, mut satisfied) = (false, false); - for order in args_all_lexs { - if let Some(f) = order.first() { - asc = !f.options.descending; + // Find one valid ordering for aggregate arguments instead of + // checking all combinations + let aggregate_exprs = sliding_expr.get_aggregate_expr().expressions(); + let mut candidate_order = vec![]; + let mut asc = false; + + for (idx, expr) in aggregate_exprs.iter().enumerate() { + let mut found = false; + let sort_options = + sort_options_resolving_constant(Arc::clone(expr), false); + + // Try each option and pick the first that works + for sort_expr in sort_options.into_iter() { + let is_asc = !sort_expr.options.descending; + candidate_order.push(sort_expr); + + if let Some(lex) = LexOrdering::new(candidate_order.clone()) { + if window_eq_properties.ordering_satisfy(lex)? { + if idx == 0 { + // The first column's ordering direction determines the overall + // monotonicity behavior of the window result. + // - If the aggregate has increasing set monotonicity (e.g., MAX, COUNT) + // and the first arg is ascending, the window result is increasing + // - If the aggregate has decreasing set monotonicity (e.g., MIN) + // and the first arg is ascending, the window result is also increasing + // This flag is used to determine the final window column ordering. + asc = is_asc; + } + found = true; + break; + } + } + // This option didn't work, remove it and try the next one + candidate_order.pop(); } - if window_eq_properties.ordering_satisfy(order)? { - satisfied = true; + + // If we couldn't extend the ordering, stop trying + if !found { break; } } + + // Check if we successfully built a complete ordering + let satisfied = candidate_order.len() == aggregate_exprs.len() + && !aggregate_exprs.is_empty(); + if satisfied { let increasing = set_monotonicity.eq(&SetMonotonicity::Increasing); @@ -634,11 +688,45 @@ pub fn get_window_mode( Ok(None) } -fn sort_options_resolving_constant(expr: Arc) -> Vec { - vec![ - PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), - PhysicalSortExpr::new(expr, SortOptions::new(true, true)), - ] +/// Generates sort option variations for a given expression. +/// +/// This function is used to handle constant columns in window operations. Since constant +/// columns can be considered as having any ordering, we generate multiple sort options +/// to explore different ordering possibilities. +/// +/// # Parameters +/// - `expr`: The physical expression to generate sort options for +/// - `only_monotonic`: If false, generates all 4 possible sort options (ASC/DESC × NULLS FIRST/LAST). +/// If true, generates only 2 options that preserve set monotonicity. +/// +/// # When to use `only_monotonic = false`: +/// Use for PARTITION BY columns where we want to explore all possible orderings to find +/// one that matches the existing data ordering. +/// +/// # When to use `only_monotonic = true`: +/// Use for aggregate/window function arguments where set monotonicity needs to be preserved. 
+/// Only generates ASC NULLS LAST and DESC NULLS FIRST because: +/// - Set monotonicity is broken if data has increasing order but nulls come first +/// - Set monotonicity is broken if data has decreasing order but nulls come last +fn sort_options_resolving_constant( + expr: Arc, + only_monotonic: bool, +) -> Vec { + if only_monotonic { + // Generate only the 2 options that preserve set monotonicity + vec![ + PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), // ASC NULLS LAST + PhysicalSortExpr::new(expr, SortOptions::new(true, true)), // DESC NULLS FIRST + ] + } else { + // Generate all 4 possible sort options for partition columns + vec![ + PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), // ASC NULLS LAST + PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, true)), // ASC NULLS FIRST + PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(true, false)), // DESC NULLS LAST + PhysicalSortExpr::new(expr, SortOptions::new(true, true)), // DESC NULLS FIRST + ] + } } #[cfg(test)] @@ -814,7 +902,7 @@ mod tests { &[], &[], Arc::new(WindowFrame::new(None)), - schema.as_ref(), + schema, false, false, None, diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml index 3e2bf6aef8..c67c8892a3 100644 --- a/datafusion/proto-common/Cargo.toml +++ b/datafusion/proto-common/Cargo.toml @@ -19,9 +19,9 @@ name = "datafusion-proto-common" description = "Protobuf serialization of DataFusion common types" keywords = ["arrow", "query", "sql"] +readme = "README.md" version = { workspace = true } edition = { workspace = true } -readme = { workspace = true } homepage = { workspace = true } repository = { workspace = true } license = { workspace = true } diff --git a/datafusion/proto-common/README.md b/datafusion/proto-common/README.md index 67b3b27870..9c4aa707b0 100644 --- a/datafusion/proto-common/README.md +++ b/datafusion/proto-common/README.md @@ -17,17 +17,21 @@ under the License. --> -# `datafusion-proto-common`: Apache DataFusion Protobuf Serialization / Deserialization +# Apache DataFusion Protobuf Common Serialization / Deserialization -This crate contains code to convert Apache [DataFusion] primitive types to and from -bytes, which can be useful for sending data over the network. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate contains code to convert DataFusion primitive types to and from +bytes using [Protocol Buffers], which can be useful for sending data over the network. See [API Docs] for details and examples. Most projects should use the [`datafusion-proto`] crate directly, which re-exports -this module. If you are already using the [`datafusion-protp`] crate, there is no +this module. If you are already using the [`datafusion-proto`] crate, there is no reason to use this crate directly in your project as well. 
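To make the reworked proto-common README above more concrete, here is a hedged round-trip sketch for one of the primitive types whose conversions are extended later in this diff (ScalarValue::Decimal32). The TryFrom impls are the ones shown in the from_proto/to_proto hunks; the `protobuf` module path is an assumption matching the names used inside this patch, and downstream code would normally go through the `datafusion-proto` re-exports instead:

use datafusion_common::ScalarValue;
// Module path assumed from the names used inside this patch; external users
// typically reach the generated types through the `datafusion-proto` crate.
use datafusion_proto_common::protobuf;

fn decimal32_roundtrip() {
    // A Decimal32 scalar with precision 5 and scale 2, i.e. the value 123.45.
    let original = ScalarValue::Decimal32(Some(12_345), 5, 2);
    // ScalarValue -> protobuf message (to_proto), then back (from_proto).
    let as_proto = protobuf::ScalarValue::try_from(&original).expect("to_proto");
    let back = ScalarValue::try_from(&as_proto).expect("from_proto");
    assert_eq!(original, back);
}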
+[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[protocol buffers]: https://protobuf.dev/ [`datafusion-proto`]: https://crates.io/crates/datafusion-proto -[datafusion]: https://datafusion.apache.org [api docs]: http://docs.rs/datafusion-proto/latest diff --git a/datafusion/proto-common/gen/Cargo.toml b/datafusion/proto-common/gen/Cargo.toml index cfd3368b0c..ef56d2697d 100644 --- a/datafusion/proto-common/gen/Cargo.toml +++ b/datafusion/proto-common/gen/Cargo.toml @@ -34,5 +34,5 @@ workspace = true [dependencies] # Pin these dependencies so that the generated output is deterministic -pbjson-build = "=0.7.0" -prost-build = "=0.13.5" +pbjson-build = "=0.8.0" +prost-build = "=0.14.1" diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index d89f73269c..267953556b 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -581,6 +581,10 @@ message ParquetOptions { oneof coerce_int96_opt { string coerce_int96 = 32; } + + oneof max_predicate_cache_size_opt { + uint64 max_predicate_cache_size = 33; + } } enum JoinSide { diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index bbfd0dfd2a..2d07fb8410 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -37,7 +37,6 @@ use datafusion_common::{ TableParquetOptions, }, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, - not_impl_err, parsers::CompressionTypeVariant, plan_datafusion_err, stats::Precision, @@ -478,13 +477,13 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { let null_type: DataType = v.try_into()?; null_type.try_into().map_err(Error::DataFusionError)? 
} - Value::Decimal32Value(_val) => { - return not_impl_err!("Decimal32 protobuf deserialization") - .map_err(Error::DataFusionError) + Value::Decimal32Value(val) => { + let array = vec_to_array(val.value.clone()); + Self::Decimal32(Some(i32::from_be_bytes(array)), val.p as u8, val.s as i8) } - Value::Decimal64Value(_val) => { - return not_impl_err!("Decimal64 protobuf deserialization") - .map_err(Error::DataFusionError) + Value::Decimal64Value(val) => { + let array = vec_to_array(val.value.clone()); + Self::Decimal64(Some(i64::from_be_bytes(array)), val.p as u8, val.s as i8) } Value::Decimal128Value(val) => { let array = vec_to_array(val.value.clone()); @@ -1000,6 +999,9 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::CoerceInt96Opt::CoerceInt96(v) => Some(v), }).unwrap_or(None), skip_arrow_metadata: value.skip_arrow_metadata, + max_predicate_cache_size: value.max_predicate_cache_size_opt.map(|opt| match opt { + protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => Some(v as usize), + }).unwrap_or(None), }) } } diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index d0c699106f..e63f345459 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -2007,7 +2007,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { if truncated_rows__.is_some() { return Err(serde::de::Error::duplicate_field("truncatedRows")); } - truncated_rows__ = + truncated_rows__ = Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) ; } @@ -5632,6 +5632,9 @@ impl serde::Serialize for ParquetOptions { if self.coerce_int96_opt.is_some() { len += 1; } + if self.max_predicate_cache_size_opt.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetOptions", len)?; if self.enable_page_index { struct_ser.serialize_field("enablePageIndex", &self.enable_page_index)?; @@ -5785,6 +5788,15 @@ impl serde::Serialize for ParquetOptions { } } } + if let Some(v) = self.max_predicate_cache_size_opt.as_ref() { + match v { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("maxPredicateCacheSize", ToString::to_string(&v).as_str())?; + } + } + } struct_ser.end() } } @@ -5852,6 +5864,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "bloomFilterNdv", "coerce_int96", "coerceInt96", + "max_predicate_cache_size", + "maxPredicateCacheSize", ]; #[allow(clippy::enum_variant_names)] @@ -5886,6 +5900,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { BloomFilterFpp, BloomFilterNdv, CoerceInt96, + MaxPredicateCacheSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -5937,6 +5952,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), "coerceInt96" | "coerce_int96" => Ok(GeneratedField::CoerceInt96), + "maxPredicateCacheSize" | "max_predicate_cache_size" => Ok(GeneratedField::MaxPredicateCacheSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -5986,6 +6002,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; let mut 
coerce_int96_opt__ = None; + let mut max_predicate_cache_size_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::EnablePageIndex => { @@ -6182,6 +6199,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } coerce_int96_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::CoerceInt96Opt::CoerceInt96); } + GeneratedField::MaxPredicateCacheSize => { + if max_predicate_cache_size_opt__.is_some() { + return Err(serde::de::Error::duplicate_field("maxPredicateCacheSize")); + } + max_predicate_cache_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(x.0)); + } } } Ok(ParquetOptions { @@ -6215,6 +6238,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, coerce_int96_opt: coerce_int96_opt__, + max_predicate_cache_size_opt: max_predicate_cache_size_opt__, }) } } diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index f09eef6786..aa7c3d51a9 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -1,10 +1,10 @@ // This file is @generated by prost-build. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ColumnRelation { #[prost(string, tag = "1")] pub relation: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Column { #[prost(string, tag = "1")] pub name: ::prost::alloc::string::String, @@ -28,7 +28,7 @@ pub struct DfSchema { ::prost::alloc::string::String, >, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvFormat { #[prost(message, optional, tag = "5")] pub options: ::core::option::Option, @@ -38,33 +38,33 @@ pub struct ParquetFormat { #[prost(message, optional, tag = "2")] pub options: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct AvroFormat {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct NdJsonFormat { #[prost(message, optional, tag = "1")] pub options: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ArrowFormat {} -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PrimaryKeyConstraint { #[prost(uint64, repeated, tag = "1")] pub indices: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct UniqueConstraint { #[prost(uint64, repeated, tag = "1")] pub indices: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Constraint { #[prost(oneof = "constraint::ConstraintMode", tags = "1, 2")] pub constraint_mode: ::core::option::Option, } /// Nested message and enum types in `Constraint`. 
pub mod constraint { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum ConstraintMode { #[prost(message, tag = "1")] PrimaryKey(super::PrimaryKeyConstraint), @@ -77,9 +77,9 @@ pub struct Constraints { #[prost(message, repeated, tag = "1")] pub constraints: ::prost::alloc::vec::Vec, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct AvroOptions {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ArrowOptions {} #[derive(Clone, PartialEq, ::prost::Message)] pub struct Schema { @@ -109,35 +109,35 @@ pub struct Field { ::prost::alloc::string::String, >, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Timestamp { #[prost(enumeration = "TimeUnit", tag = "1")] pub time_unit: i32, #[prost(string, tag = "2")] pub timezone: ::prost::alloc::string::String, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal32Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal64Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal256Type { #[prost(uint32, tag = "3")] pub precision: u32, @@ -198,7 +198,7 @@ pub struct ScalarNestedValue { } /// Nested message and enum types in `ScalarNestedValue`. pub mod scalar_nested_value { - #[derive(Clone, PartialEq, ::prost::Message)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Dictionary { #[prost(bytes = "vec", tag = "1")] pub ipc_message: ::prost::alloc::vec::Vec, @@ -206,14 +206,14 @@ pub mod scalar_nested_value { pub arrow_data: ::prost::alloc::vec::Vec, } } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTime32Value { #[prost(oneof = "scalar_time32_value::Value", tags = "1, 2")] pub value: ::core::option::Option, } /// Nested message and enum types in `ScalarTime32Value`. pub mod scalar_time32_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int32, tag = "1")] Time32SecondValue(i32), @@ -221,14 +221,14 @@ pub mod scalar_time32_value { Time32MillisecondValue(i32), } } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTime64Value { #[prost(oneof = "scalar_time64_value::Value", tags = "1, 2")] pub value: ::core::option::Option, } /// Nested message and enum types in `ScalarTime64Value`. 
pub mod scalar_time64_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] Time64MicrosecondValue(i64), @@ -236,7 +236,7 @@ pub mod scalar_time64_value { Time64NanosecondValue(i64), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTimestampValue { #[prost(string, tag = "5")] pub timezone: ::prost::alloc::string::String, @@ -245,7 +245,7 @@ pub struct ScalarTimestampValue { } /// Nested message and enum types in `ScalarTimestampValue`. pub mod scalar_timestamp_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] TimeMicrosecondValue(i64), @@ -264,14 +264,14 @@ pub struct ScalarDictionaryValue { #[prost(message, optional, boxed, tag = "2")] pub value: ::core::option::Option<::prost::alloc::boxed::Box>, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct IntervalDayTimeValue { #[prost(int32, tag = "1")] pub days: i32, #[prost(int32, tag = "2")] pub milliseconds: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct IntervalMonthDayNanoValue { #[prost(int32, tag = "1")] pub months: i32, @@ -300,7 +300,7 @@ pub struct UnionValue { #[prost(enumeration = "UnionMode", tag = "4")] pub mode: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarFixedSizeBinary { #[prost(bytes = "vec", tag = "1")] pub values: ::prost::alloc::vec::Vec, @@ -408,7 +408,7 @@ pub mod scalar_value { UnionValue(::prost::alloc::boxed::Box), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal32 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -417,7 +417,7 @@ pub struct Decimal32 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal64 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -426,7 +426,7 @@ pub struct Decimal64 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -435,7 +435,7 @@ pub struct Decimal128 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal256 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -549,14 +549,14 @@ pub mod arrow_type { /// i32 Two = 2; /// } /// } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct EmptyMessage {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct JsonWriterOptions { #[prost(enumeration = "CompressionTypeVariant", tag = "1")] pub compression: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvWriterOptions { /// Compression type #[prost(enumeration = 
"CompressionTypeVariant", tag = "1")] @@ -593,7 +593,7 @@ pub struct CsvWriterOptions { pub double_quote: bool, } /// Options controlling CSV format -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvOptions { /// Indicates if the CSV has a header row #[prost(bytes = "vec", tag = "1")] @@ -651,7 +651,7 @@ pub struct CsvOptions { pub truncated_rows: ::prost::alloc::vec::Vec, } /// Options controlling CSV format -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct JsonOptions { /// Compression type #[prost(enumeration = "CompressionTypeVariant", tag = "1")] @@ -708,27 +708,27 @@ pub struct ParquetColumnOptions { } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterEnabledOpt { #[prost(bool, tag = "1")] BloomFilterEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum EncodingOpt { #[prost(string, tag = "2")] Encoding(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "3")] DictionaryEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CompressionOpt { #[prost(string, tag = "4")] Compression(::prost::alloc::string::String), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsEnabledOpt { #[prost(string, tag = "5")] StatisticsEnabled(::prost::alloc::string::String), @@ -738,7 +738,7 @@ pub mod parquet_column_options { #[prost(double, tag = "6")] BloomFilterFpp(f64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), @@ -836,40 +836,44 @@ pub struct ParquetOptions { pub bloom_filter_ndv_opt: ::core::option::Option, #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")] pub coerce_int96_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")] + pub max_predicate_cache_size_opt: ::core::option::Option< + parquet_options::MaxPredicateCacheSizeOpt, + >, } /// Nested message and enum types in `ParquetOptions`. 
pub mod parquet_options { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum MetadataSizeHintOpt { #[prost(uint64, tag = "4")] MetadataSizeHint(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CompressionOpt { #[prost(string, tag = "10")] Compression(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "11")] DictionaryEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsEnabledOpt { #[prost(string, tag = "13")] StatisticsEnabled(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsTruncateLengthOpt { #[prost(uint64, tag = "31")] StatisticsTruncateLength(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum EncodingOpt { #[prost(string, tag = "19")] Encoding(::prost::alloc::string::String), @@ -879,16 +883,21 @@ pub mod parquet_options { #[prost(double, tag = "21")] BloomFilterFpp(f64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "22")] BloomFilterNdv(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CoerceInt96Opt { #[prost(string, tag = "32")] CoerceInt96(::prost::alloc::string::String), } + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] + pub enum MaxPredicateCacheSizeOpt { + #[prost(uint64, tag = "33")] + MaxPredicateCacheSize(u64), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct Precision { diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 2902a9ce54..8e4131479e 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -65,7 +65,7 @@ impl std::fmt::Display for Error { write!(f, "{value:?} is invalid as a DataFusion scalar value") } Self::InvalidScalarType(data_type) => { - write!(f, "{data_type:?} is invalid as a DataFusion scalar type") + write!(f, "{data_type} is invalid as a DataFusion scalar type") } Self::InvalidTimeUnit(time_unit) => { write!( @@ -405,6 +405,42 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue { }) }) } + ScalarValue::Decimal32(val, p, s) => match *val { + Some(v) => { + let array = v.to_be_bytes(); + let vec_val: Vec = array.to_vec(); + Ok(protobuf::ScalarValue { + value: Some(Value::Decimal32Value(protobuf::Decimal32 { + value: vec_val, + p: *p as i64, + s: *s as i64, + })), + }) + } + None => Ok(protobuf::ScalarValue { + value: Some(protobuf::scalar_value::Value::NullValue( + (&data_type).try_into()?, + )), + }), + }, + ScalarValue::Decimal64(val, p, s) => match *val { + Some(v) => { + let array = v.to_be_bytes(); + let vec_val: Vec = array.to_vec(); + Ok(protobuf::ScalarValue { + value: Some(Value::Decimal64Value(protobuf::Decimal64 { + value: vec_val, + p: *p as i64, + 
s: *s as i64, + })), + }) + } + None => Ok(protobuf::ScalarValue { + value: Some(protobuf::scalar_value::Value::NullValue( + (&data_type).try_into()?, + )), + }), + }, ScalarValue::Decimal128(val, p, s) => match *val { Some(v) => { let array = v.to_be_bytes(); @@ -842,6 +878,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { binary_as_string: value.binary_as_string, skip_arrow_metadata: value.skip_arrow_metadata, coerce_int96_opt: value.coerce_int96.clone().map(protobuf::parquet_options::CoerceInt96Opt::CoerceInt96), + max_predicate_cache_size_opt: value.max_predicate_cache_size.map(|v| protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v as u64)), }) } } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 9dc433f7ef..c1d894a6c0 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -46,7 +46,7 @@ avro = ["datafusion/avro", "datafusion-common/avro"] [dependencies] arrow = { workspace = true } chrono = { workspace = true } -datafusion = { workspace = true, default-features = false, features = ["parquet", "nested_expressions"] } +datafusion = { workspace = true, default-features = false } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } @@ -57,7 +57,12 @@ serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } [dev-dependencies] -datafusion = { workspace = true, default-features = false, features = ["sql"] } +datafusion = { workspace = true, default-features = false, features = [ + "sql", + "datetime_expressions", + "nested_expressions", + "unicode_expressions", +] } datafusion-functions = { workspace = true, default-features = true } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window-common = { workspace = true } diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md index f8930779db..c1382c5b8f 100644 --- a/datafusion/proto/README.md +++ b/datafusion/proto/README.md @@ -17,13 +17,17 @@ under the License. --> -# `datafusion-proto`: Apache DataFusion Protobuf Serialization / Deserialization +# Apache DataFusion Protobuf Serialization / Deserialization -This crate contains code to convert [Apache DataFusion] plans to and from -bytes, which can be useful for sending plans over the network, for example -when building a distributed query engine. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate contains code to convert DataFusion plans to and from bytes using [Protocol Buffers], +which can be useful for sending plans over the network, for example when building a distributed +query engine. See [API Docs] for details and examples. 
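As a companion to the reworked datafusion-proto README above, a minimal logical-plan round-trip sketch, assuming the logical_plan_to_bytes / logical_plan_from_bytes helpers exported from datafusion_proto::bytes; the table name and CSV path are placeholders. Note that the physical-plan counterparts in the bytes/mod.rs hunk below now take a &TaskContext (for example via ctx.task_ctx()) rather than a &SessionContext:

use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Placeholder data source; any registered table works here.
    ctx.register_csv("t", "tests/data/example.csv", CsvReadOptions::new())
        .await?;
    let plan = ctx.sql("SELECT a FROM t").await?.into_unoptimized_plan();

    // Encode the plan as protobuf bytes and decode it again, resolving any
    // functions it references against the same SessionContext.
    let bytes = logical_plan_to_bytes(&plan)?;
    let roundtrip = logical_plan_from_bytes(&bytes, &ctx)?;
    assert_eq!(
        format!("{}", plan.display_indent()),
        format!("{}", roundtrip.display_indent())
    );
    Ok(())
}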
-[apache datafusion]: https://datafusion.apache.org +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[protocol buffers]: https://protobuf.dev/ [api docs]: http://docs.rs/datafusion-proto/latest diff --git a/datafusion/proto/gen/Cargo.toml b/datafusion/proto/gen/Cargo.toml index 467a7f487d..c2096b6011 100644 --- a/datafusion/proto/gen/Cargo.toml +++ b/datafusion/proto/gen/Cargo.toml @@ -34,5 +34,5 @@ workspace = true [dependencies] # Pin these dependencies so that the generated output is deterministic -pbjson-build = "=0.7.0" -prost-build = "=0.13.5" +pbjson-build = "=0.8.0" +prost-build = "=0.14.1" diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 4f411a4a93..ee9ac0e790 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -167,6 +167,7 @@ message CreateExternalTableNode { datafusion_common.DfSchema schema = 4; repeated string table_partition_cols = 5; bool if_not_exists = 6; + bool or_replace = 15; bool temporary = 14; string definition = 7; repeated SortExprNodeCollection order_exprs = 10; diff --git a/datafusion/proto/src/bytes/mod.rs b/datafusion/proto/src/bytes/mod.rs index da01d89c0c..5b07e59e80 100644 --- a/datafusion/proto/src/bytes/mod.rs +++ b/datafusion/proto/src/bytes/mod.rs @@ -24,6 +24,7 @@ use crate::physical_plan::{ AsExecutionPlan, DefaultPhysicalExtensionCodec, PhysicalExtensionCodec, }; use crate::protobuf; +use datafusion::execution::TaskContext; use datafusion_common::{plan_datafusion_err, Result}; use datafusion_expr::{ create_udaf, create_udf, create_udwf, AggregateUDF, Expr, LogicalPlan, Volatility, @@ -170,6 +171,14 @@ impl Serializeable for Expr { fn expr_planners(&self) -> Vec> { vec![] } + + fn udafs(&self) -> std::collections::HashSet { + std::collections::HashSet::default() + } + + fn udwfs(&self) -> std::collections::HashSet { + std::collections::HashSet::default() + } } Expr::from_bytes_with_registry(&bytes, &PlaceHolderRegistry)?; @@ -308,13 +317,13 @@ pub fn physical_plan_from_json( let back: protobuf::PhysicalPlanNode = serde_json::from_str(json) .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?; let extension_codec = DefaultPhysicalExtensionCodec {}; - back.try_into_physical_plan(ctx, &ctx.runtime_env(), &extension_codec) + back.try_into_physical_plan(&ctx.task_ctx(), &extension_codec) } /// Deserialize a PhysicalPlan from bytes pub fn physical_plan_from_bytes( bytes: &[u8], - ctx: &SessionContext, + ctx: &TaskContext, ) -> Result> { let extension_codec = DefaultPhysicalExtensionCodec {}; physical_plan_from_bytes_with_extension_codec(bytes, ctx, &extension_codec) @@ -323,10 +332,10 @@ pub fn physical_plan_from_bytes( /// Deserialize a PhysicalPlan from bytes pub fn physical_plan_from_bytes_with_extension_codec( bytes: &[u8], - ctx: &SessionContext, + ctx: &TaskContext, extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let protobuf = protobuf::PhysicalPlanNode::decode(bytes) .map_err(|e| plan_datafusion_err!("Error decoding expr as protobuf: {e}"))?; - protobuf.try_into_physical_plan(ctx, &ctx.runtime_env(), extension_codec) + protobuf.try_into_physical_plan(ctx, extension_codec) } diff --git a/datafusion/proto/src/bytes/registry.rs b/datafusion/proto/src/bytes/registry.rs index eae2425f8a..5d46d41f79 100644 --- a/datafusion/proto/src/bytes/registry.rs +++ b/datafusion/proto/src/bytes/registry.rs @@ -59,4 +59,12 @@ impl FunctionRegistry for NoRegistry { fn expr_planners(&self) -> 
Vec> { vec![] } + + fn udafs(&self) -> HashSet { + HashSet::new() + } + + fn udwfs(&self) -> HashSet { + HashSet::new() + } } diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index f09eef6786..aa7c3d51a9 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -1,10 +1,10 @@ // This file is @generated by prost-build. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ColumnRelation { #[prost(string, tag = "1")] pub relation: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Column { #[prost(string, tag = "1")] pub name: ::prost::alloc::string::String, @@ -28,7 +28,7 @@ pub struct DfSchema { ::prost::alloc::string::String, >, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvFormat { #[prost(message, optional, tag = "5")] pub options: ::core::option::Option, @@ -38,33 +38,33 @@ pub struct ParquetFormat { #[prost(message, optional, tag = "2")] pub options: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct AvroFormat {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct NdJsonFormat { #[prost(message, optional, tag = "1")] pub options: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ArrowFormat {} -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PrimaryKeyConstraint { #[prost(uint64, repeated, tag = "1")] pub indices: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct UniqueConstraint { #[prost(uint64, repeated, tag = "1")] pub indices: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Constraint { #[prost(oneof = "constraint::ConstraintMode", tags = "1, 2")] pub constraint_mode: ::core::option::Option, } /// Nested message and enum types in `Constraint`. 
pub mod constraint { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum ConstraintMode { #[prost(message, tag = "1")] PrimaryKey(super::PrimaryKeyConstraint), @@ -77,9 +77,9 @@ pub struct Constraints { #[prost(message, repeated, tag = "1")] pub constraints: ::prost::alloc::vec::Vec, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct AvroOptions {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ArrowOptions {} #[derive(Clone, PartialEq, ::prost::Message)] pub struct Schema { @@ -109,35 +109,35 @@ pub struct Field { ::prost::alloc::string::String, >, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Timestamp { #[prost(enumeration = "TimeUnit", tag = "1")] pub time_unit: i32, #[prost(string, tag = "2")] pub timezone: ::prost::alloc::string::String, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal32Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal64Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] pub scale: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal256Type { #[prost(uint32, tag = "3")] pub precision: u32, @@ -198,7 +198,7 @@ pub struct ScalarNestedValue { } /// Nested message and enum types in `ScalarNestedValue`. pub mod scalar_nested_value { - #[derive(Clone, PartialEq, ::prost::Message)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Dictionary { #[prost(bytes = "vec", tag = "1")] pub ipc_message: ::prost::alloc::vec::Vec, @@ -206,14 +206,14 @@ pub mod scalar_nested_value { pub arrow_data: ::prost::alloc::vec::Vec, } } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTime32Value { #[prost(oneof = "scalar_time32_value::Value", tags = "1, 2")] pub value: ::core::option::Option, } /// Nested message and enum types in `ScalarTime32Value`. pub mod scalar_time32_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int32, tag = "1")] Time32SecondValue(i32), @@ -221,14 +221,14 @@ pub mod scalar_time32_value { Time32MillisecondValue(i32), } } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTime64Value { #[prost(oneof = "scalar_time64_value::Value", tags = "1, 2")] pub value: ::core::option::Option, } /// Nested message and enum types in `ScalarTime64Value`. 
pub mod scalar_time64_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] Time64MicrosecondValue(i64), @@ -236,7 +236,7 @@ pub mod scalar_time64_value { Time64NanosecondValue(i64), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarTimestampValue { #[prost(string, tag = "5")] pub timezone: ::prost::alloc::string::String, @@ -245,7 +245,7 @@ pub struct ScalarTimestampValue { } /// Nested message and enum types in `ScalarTimestampValue`. pub mod scalar_timestamp_value { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] TimeMicrosecondValue(i64), @@ -264,14 +264,14 @@ pub struct ScalarDictionaryValue { #[prost(message, optional, boxed, tag = "2")] pub value: ::core::option::Option<::prost::alloc::boxed::Box>, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct IntervalDayTimeValue { #[prost(int32, tag = "1")] pub days: i32, #[prost(int32, tag = "2")] pub milliseconds: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct IntervalMonthDayNanoValue { #[prost(int32, tag = "1")] pub months: i32, @@ -300,7 +300,7 @@ pub struct UnionValue { #[prost(enumeration = "UnionMode", tag = "4")] pub mode: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScalarFixedSizeBinary { #[prost(bytes = "vec", tag = "1")] pub values: ::prost::alloc::vec::Vec, @@ -408,7 +408,7 @@ pub mod scalar_value { UnionValue(::prost::alloc::boxed::Box), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal32 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -417,7 +417,7 @@ pub struct Decimal32 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal64 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -426,7 +426,7 @@ pub struct Decimal64 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -435,7 +435,7 @@ pub struct Decimal128 { #[prost(int64, tag = "3")] pub s: i64, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Decimal256 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -549,14 +549,14 @@ pub mod arrow_type { /// i32 Two = 2; /// } /// } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct EmptyMessage {} -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct JsonWriterOptions { #[prost(enumeration = "CompressionTypeVariant", tag = "1")] pub compression: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvWriterOptions { /// Compression type #[prost(enumeration = 
"CompressionTypeVariant", tag = "1")] @@ -593,7 +593,7 @@ pub struct CsvWriterOptions { pub double_quote: bool, } /// Options controlling CSV format -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct CsvOptions { /// Indicates if the CSV has a header row #[prost(bytes = "vec", tag = "1")] @@ -651,7 +651,7 @@ pub struct CsvOptions { pub truncated_rows: ::prost::alloc::vec::Vec, } /// Options controlling CSV format -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct JsonOptions { /// Compression type #[prost(enumeration = "CompressionTypeVariant", tag = "1")] @@ -708,27 +708,27 @@ pub struct ParquetColumnOptions { } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterEnabledOpt { #[prost(bool, tag = "1")] BloomFilterEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum EncodingOpt { #[prost(string, tag = "2")] Encoding(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "3")] DictionaryEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CompressionOpt { #[prost(string, tag = "4")] Compression(::prost::alloc::string::String), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsEnabledOpt { #[prost(string, tag = "5")] StatisticsEnabled(::prost::alloc::string::String), @@ -738,7 +738,7 @@ pub mod parquet_column_options { #[prost(double, tag = "6")] BloomFilterFpp(f64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), @@ -836,40 +836,44 @@ pub struct ParquetOptions { pub bloom_filter_ndv_opt: ::core::option::Option, #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")] pub coerce_int96_opt: ::core::option::Option, + #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")] + pub max_predicate_cache_size_opt: ::core::option::Option< + parquet_options::MaxPredicateCacheSizeOpt, + >, } /// Nested message and enum types in `ParquetOptions`. 
pub mod parquet_options { - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum MetadataSizeHintOpt { #[prost(uint64, tag = "4")] MetadataSizeHint(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CompressionOpt { #[prost(string, tag = "10")] Compression(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "11")] DictionaryEnabled(bool), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsEnabledOpt { #[prost(string, tag = "13")] StatisticsEnabled(::prost::alloc::string::String), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum StatisticsTruncateLengthOpt { #[prost(uint64, tag = "31")] StatisticsTruncateLength(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum EncodingOpt { #[prost(string, tag = "19")] Encoding(::prost::alloc::string::String), @@ -879,16 +883,21 @@ pub mod parquet_options { #[prost(double, tag = "21")] BloomFilterFpp(f64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "22")] BloomFilterNdv(u64), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum CoerceInt96Opt { #[prost(string, tag = "32")] CoerceInt96(::prost::alloc::string::String), } + #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)] + pub enum MaxPredicateCacheSizeOpt { + #[prost(uint64, tag = "33")] + MaxPredicateCacheSize(u64), + } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct Precision { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index ff7519aa5d..29967d8120 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -3094,6 +3094,9 @@ impl serde::Serialize for CreateExternalTableNode { if self.if_not_exists { len += 1; } + if self.or_replace { + len += 1; + } if self.temporary { len += 1; } @@ -3134,6 +3137,9 @@ impl serde::Serialize for CreateExternalTableNode { if self.if_not_exists { struct_ser.serialize_field("ifNotExists", &self.if_not_exists)?; } + if self.or_replace { + struct_ser.serialize_field("orReplace", &self.or_replace)?; + } if self.temporary { struct_ser.serialize_field("temporary", &self.temporary)?; } @@ -3174,6 +3180,8 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { "tablePartitionCols", "if_not_exists", "ifNotExists", + "or_replace", + "orReplace", "temporary", "definition", "order_exprs", @@ -3193,6 +3201,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { Schema, TablePartitionCols, IfNotExists, + OrReplace, Temporary, Definition, OrderExprs, @@ -3227,6 +3236,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { "schema" => Ok(GeneratedField::Schema), "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols), 
"ifNotExists" | "if_not_exists" => Ok(GeneratedField::IfNotExists), + "orReplace" | "or_replace" => Ok(GeneratedField::OrReplace), "temporary" => Ok(GeneratedField::Temporary), "definition" => Ok(GeneratedField::Definition), "orderExprs" | "order_exprs" => Ok(GeneratedField::OrderExprs), @@ -3259,6 +3269,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { let mut schema__ = None; let mut table_partition_cols__ = None; let mut if_not_exists__ = None; + let mut or_replace__ = None; let mut temporary__ = None; let mut definition__ = None; let mut order_exprs__ = None; @@ -3304,6 +3315,12 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { } if_not_exists__ = Some(map_.next_value()?); } + GeneratedField::OrReplace => { + if or_replace__.is_some() { + return Err(serde::de::Error::duplicate_field("orReplace")); + } + or_replace__ = Some(map_.next_value()?); + } GeneratedField::Temporary => { if temporary__.is_some() { return Err(serde::de::Error::duplicate_field("temporary")); @@ -3359,6 +3376,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { schema: schema__, table_partition_cols: table_partition_cols__.unwrap_or_default(), if_not_exists: if_not_exists__.unwrap_or_default(), + or_replace: or_replace__.unwrap_or_default(), temporary: temporary__.unwrap_or_default(), definition: definition__.unwrap_or_default(), order_exprs: order_exprs__.unwrap_or_default(), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index ffb7308665..d3b5f566e9 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -86,7 +86,7 @@ pub struct LogicalExtensionNode { #[prost(message, repeated, tag = "2")] pub inputs: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ProjectionColumns { #[prost(string, repeated, tag = "1")] pub columns: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, @@ -185,7 +185,7 @@ pub struct ProjectionNode { } /// Nested message and enum types in `ProjectionNode`. 
pub mod projection_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum OptionalAlias { #[prost(string, tag = "3")] Alias(::prost::alloc::string::String), @@ -232,7 +232,7 @@ pub struct HashRepartition { #[prost(uint64, tag = "2")] pub partition_count: u64, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct EmptyRelationNode { #[prost(bool, tag = "1")] pub produce_one_row: bool, @@ -251,6 +251,8 @@ pub struct CreateExternalTableNode { pub table_partition_cols: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, #[prost(bool, tag = "6")] pub if_not_exists: bool, + #[prost(bool, tag = "15")] + pub or_replace: bool, #[prost(bool, tag = "14")] pub temporary: bool, #[prost(string, tag = "7")] @@ -485,7 +487,7 @@ pub struct UnnestNode { #[prost(message, optional, tag = "7")] pub options: ::core::option::Option, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ColumnUnnestListItem { #[prost(uint32, tag = "1")] pub input_index: u32, @@ -497,7 +499,7 @@ pub struct ColumnUnnestListRecursions { #[prost(message, repeated, tag = "2")] pub recursions: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct ColumnUnnestListRecursion { #[prost(message, optional, tag = "1")] pub output_column: ::core::option::Option, @@ -511,7 +513,7 @@ pub struct UnnestOptions { #[prost(message, repeated, tag = "2")] pub recursions: ::prost::alloc::vec::Vec, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct RecursionUnnestOption { #[prost(message, optional, tag = "1")] pub output_column: ::core::option::Option, @@ -640,7 +642,7 @@ pub mod logical_expr_node { Unnest(super::Unnest), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Wildcard { #[prost(message, optional, tag = "1")] pub qualifier: ::core::option::Option, @@ -821,7 +823,7 @@ pub struct WindowExprNode { } /// Nested message and enum types in `WindowExprNode`. 
pub mod window_expr_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum WindowFunction { /// BuiltInWindowFunction built_in_function = 2; #[prost(string, tag = "3")] @@ -941,27 +943,27 @@ pub struct WindowFrameBound { #[prost(message, optional, tag = "2")] pub bound_value: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct FixedSizeBinary { #[prost(int32, tag = "1")] pub length: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct AnalyzedLogicalPlanType { #[prost(string, tag = "1")] pub analyzer_name: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct OptimizedLogicalPlanType { #[prost(string, tag = "1")] pub optimizer_name: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct OptimizedPhysicalPlanType { #[prost(string, tag = "1")] pub optimizer_name: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PlanType { #[prost( oneof = "plan_type::PlanTypeEnum", @@ -971,7 +973,7 @@ pub struct PlanType { } /// Nested message and enum types in `PlanType`. pub mod plan_type { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum PlanTypeEnum { #[prost(message, tag = "1")] InitialLogicalPlan(super::super::datafusion_common::EmptyMessage), @@ -1001,26 +1003,26 @@ pub mod plan_type { PhysicalPlanError(super::super::datafusion_common::EmptyMessage), } } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct StringifiedPlan { #[prost(message, optional, tag = "1")] pub plan_type: ::core::option::Option, #[prost(string, tag = "2")] pub plan: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct BareTableReference { #[prost(string, tag = "1")] pub table: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PartialTableReference { #[prost(string, tag = "1")] pub schema: ::prost::alloc::string::String, #[prost(string, tag = "2")] pub table: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct FullTableReference { #[prost(string, tag = "1")] pub catalog: ::prost::alloc::string::String, @@ -1029,7 +1031,7 @@ pub struct FullTableReference { #[prost(string, tag = "3")] pub table: ::prost::alloc::string::String, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct TableReference { #[prost(oneof = "table_reference::TableReferenceEnum", tags = "1, 2, 3")] pub table_reference_enum: ::core::option::Option< @@ -1038,7 +1040,7 @@ pub struct TableReference { } /// Nested message and enum types in `TableReference`. 
pub mod table_reference { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum TableReferenceEnum { #[prost(message, tag = "1")] Bare(super::BareTableReference), @@ -1232,7 +1234,7 @@ pub struct UnnestExecNode { #[prost(message, optional, tag = "5")] pub options: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ListUnnest { #[prost(uint32, tag = "1")] pub index_in_input_schema: u32, @@ -1340,7 +1342,7 @@ pub struct PhysicalAggregateExprNode { } /// Nested message and enum types in `PhysicalAggregateExprNode`. pub mod physical_aggregate_expr_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum AggregateFunction { #[prost(string, tag = "4")] UserDefinedAggrFunction(::prost::alloc::string::String), @@ -1371,7 +1373,7 @@ pub struct PhysicalWindowExprNode { } /// Nested message and enum types in `PhysicalWindowExprNode`. pub mod physical_window_expr_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum WindowFunction { /// BuiltInWindowFunction built_in_function = 2; #[prost(string, tag = "3")] @@ -1507,7 +1509,7 @@ pub struct FileGroup { #[prost(message, repeated, tag = "1")] pub files: ::prost::alloc::vec::Vec, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ScanLimit { /// wrap into a message to make it optional #[prost(uint32, tag = "1")] @@ -1573,12 +1575,12 @@ pub struct CsvScanExecNode { } /// Nested message and enum types in `CsvScanExecNode`. pub mod csv_scan_exec_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum OptionalEscape { #[prost(string, tag = "5")] Escape(::prost::alloc::string::String), } - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum OptionalComment { #[prost(string, tag = "6")] Comment(::prost::alloc::string::String), @@ -1691,14 +1693,14 @@ pub struct CrossJoinExecNode { #[prost(message, optional, boxed, tag = "2")] pub right: ::core::option::Option<::prost::alloc::boxed::Box>, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PhysicalColumn { #[prost(string, tag = "1")] pub name: ::prost::alloc::string::String, #[prost(uint32, tag = "2")] pub index: u32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct UnknownColumn { #[prost(string, tag = "1")] pub name: ::prost::alloc::string::String, @@ -1729,7 +1731,7 @@ pub struct ProjectionExecNode { #[prost(string, repeated, tag = "3")] pub expr_name: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct PartiallySortedInputOrderMode { #[prost(uint64, repeated, tag = "6")] pub columns: ::prost::alloc::vec::Vec, @@ -1749,7 +1751,7 @@ pub struct WindowAggExecNode { /// Nested message and enum types in `WindowAggExecNode`. pub mod window_agg_exec_node { /// Set optional to `None` for `BoundedWindowAggExec`. 
- #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum InputOrderMode { #[prost(message, tag = "7")] Linear(super::super::datafusion_common::EmptyMessage), @@ -1769,7 +1771,7 @@ pub struct MaybePhysicalSortExprs { #[prost(message, repeated, tag = "1")] pub sort_expr: ::prost::alloc::vec::Vec, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct AggLimit { /// wrap into a message to make it optional #[prost(uint64, tag = "1")] @@ -1915,7 +1917,7 @@ pub struct JoinFilter { #[prost(message, optional, tag = "3")] pub schema: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ColumnIndex { #[prost(uint32, tag = "1")] pub index: u32, @@ -1939,7 +1941,7 @@ pub struct PartitionedFile { #[prost(message, optional, tag = "6")] pub statistics: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct FileRange { #[prost(int64, tag = "1")] pub start: i64, @@ -1977,12 +1979,12 @@ pub struct CteWorkTableScanNode { #[prost(message, optional, tag = "2")] pub schema: ::core::option::Option, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GenerateSeriesArgsContainsNull { #[prost(enumeration = "GenerateSeriesName", tag = "1")] pub name: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GenerateSeriesArgsInt64 { #[prost(int64, tag = "1")] pub start: i64, @@ -1995,7 +1997,7 @@ pub struct GenerateSeriesArgsInt64 { #[prost(enumeration = "GenerateSeriesName", tag = "5")] pub name: i32, } -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct GenerateSeriesArgsTimestamp { #[prost(int64, tag = "1")] pub start: i64, @@ -2012,7 +2014,7 @@ pub struct GenerateSeriesArgsTimestamp { #[prost(enumeration = "GenerateSeriesName", tag = "6")] pub name: i32, } -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GenerateSeriesArgsDate { #[prost(int64, tag = "1")] pub start: i64, @@ -2038,7 +2040,7 @@ pub struct GenerateSeriesNode { } /// Nested message and enum types in `GenerateSeriesNode`. pub mod generate_series_node { - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)] pub enum Args { #[prost(message, tag = "3")] ContainsNull(super::GenerateSeriesArgsContainsNull), diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index b4d72aa1b6..691ee3f067 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -115,7 +115,7 @@ //! let bytes = physical_plan_to_bytes(physical_plan.clone())?; //! //! // Decode bytes from somewhere (over network, etc.) back to ExecutionPlan -//! let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; +//! let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?; //! assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip)); //! # Ok(()) //! 
# } diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 492795855c..f9989bdb2c 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -18,13 +18,10 @@ use std::sync::Arc; use datafusion::{ - config::{ - CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions, - TableParquetOptions, - }, + config::{CsvOptions, JsonOptions}, datasource::file_format::{ arrow::ArrowFormatFactory, csv::CsvFormatFactory, json::JsonFormatFactory, - parquet::ParquetFormatFactory, FileFormatFactory, + FileFormatFactory, }, prelude::SessionContext, }; @@ -34,12 +31,7 @@ use datafusion_common::{ }; use prost::Message; -use crate::protobuf::{ - parquet_column_options, parquet_options, CsvOptions as CsvOptionsProto, - JsonOptions as JsonOptionsProto, ParquetColumnOptions as ParquetColumnOptionsProto, - ParquetColumnSpecificOptions, ParquetOptions as ParquetOptionsProto, - TableParquetOptions as TableParquetOptionsProto, -}; +use crate::protobuf::{CsvOptions as CsvOptionsProto, JsonOptions as JsonOptionsProto}; use super::LogicalExtensionCodec; @@ -355,16 +347,32 @@ impl LogicalExtensionCodec for JsonLogicalExtensionCodec { } } -impl TableParquetOptionsProto { - fn from_factory(factory: &ParquetFormatFactory) -> Self { - let global_options = if let Some(ref options) = factory.options { - options.clone() - } else { - return TableParquetOptionsProto::default(); - }; +#[cfg(feature = "parquet")] +mod parquet { + use super::*; + + use crate::protobuf::{ + parquet_column_options, parquet_options, + ParquetColumnOptions as ParquetColumnOptionsProto, ParquetColumnSpecificOptions, + ParquetOptions as ParquetOptionsProto, + TableParquetOptions as TableParquetOptionsProto, + }; + + use datafusion::{ + config::{ParquetColumnOptions, ParquetOptions, TableParquetOptions}, + datasource::file_format::parquet::ParquetFormatFactory, + }; + + impl TableParquetOptionsProto { + fn from_factory(factory: &ParquetFormatFactory) -> Self { + let global_options = if let Some(ref options) = factory.options { + options.clone() + } else { + return TableParquetOptionsProto::default(); + }; - let column_specific_options = global_options.column_specific_options; - #[allow(deprecated)] // max_statistics_size + let column_specific_options = global_options.column_specific_options; + #[allow(deprecated)] // max_statistics_size TableParquetOptionsProto { global: Some(ParquetOptionsProto { enable_page_index: global_options.global.enable_page_index, @@ -417,6 +425,9 @@ impl TableParquetOptionsProto { coerce_int96_opt: global_options.global.coerce_int96.map(|compression| { parquet_options::CoerceInt96Opt::CoerceInt96(compression) }), + max_predicate_cache_size_opt: global_options.global.max_predicate_cache_size.map(|size| { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size as u64) + }), }), column_specific_options: column_specific_options.into_iter().map(|(column_name, options)| { ParquetColumnSpecificOptions { @@ -453,12 +464,12 @@ impl TableParquetOptionsProto { }) .collect(), } + } } -} -impl From<&ParquetOptionsProto> for ParquetOptions { - fn from(proto: &ParquetOptionsProto) -> Self { - #[allow(deprecated)] // max_statistics_size + impl From<&ParquetOptionsProto> for ParquetOptions { + fn from(proto: &ParquetOptionsProto) -> Self { + #[allow(deprecated)] // max_statistics_size ParquetOptions { enable_page_index: proto.enable_page_index, pruning: proto.pruning, @@ -510,13 +521,16 @@ impl 
From<&ParquetOptionsProto> for ParquetOptions { coerce_int96: proto.coerce_int96_opt.as_ref().map(|opt| match opt { parquet_options::CoerceInt96Opt::CoerceInt96(coerce_int96) => coerce_int96.clone(), }), + max_predicate_cache_size: proto.max_predicate_cache_size_opt.as_ref().map(|opt| match opt { + parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size) => *size as usize, + }), + } } } -} -impl From for ParquetColumnOptions { - fn from(proto: ParquetColumnOptionsProto) -> Self { - #[allow(deprecated)] // max_statistics_size + impl From for ParquetColumnOptions { + fn from(proto: ParquetColumnOptionsProto) -> Self { + #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { bloom_filter_enabled: proto.bloom_filter_enabled_opt.map( |parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v, @@ -540,124 +554,135 @@ impl From for ParquetColumnOptions { .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), } + } } -} -impl From<&TableParquetOptionsProto> for TableParquetOptions { - fn from(proto: &TableParquetOptionsProto) -> Self { - TableParquetOptions { - global: proto - .global - .as_ref() - .map(ParquetOptions::from) - .unwrap_or_default(), - column_specific_options: proto - .column_specific_options - .iter() - .map(|parquet_column_options| { - ( - parquet_column_options.column_name.clone(), - ParquetColumnOptions::from( - parquet_column_options.options.clone().unwrap_or_default(), - ), - ) - }) - .collect(), - key_value_metadata: proto - .key_value_metadata - .iter() - .map(|(k, v)| (k.clone(), Some(v.clone()))) - .collect(), - crypto: Default::default(), + impl From<&TableParquetOptionsProto> for TableParquetOptions { + fn from(proto: &TableParquetOptionsProto) -> Self { + TableParquetOptions { + global: proto + .global + .as_ref() + .map(ParquetOptions::from) + .unwrap_or_default(), + column_specific_options: proto + .column_specific_options + .iter() + .map(|parquet_column_options| { + ( + parquet_column_options.column_name.clone(), + ParquetColumnOptions::from( + parquet_column_options + .options + .clone() + .unwrap_or_default(), + ), + ) + }) + .collect(), + key_value_metadata: proto + .key_value_metadata + .iter() + .map(|(k, v)| (k.clone(), Some(v.clone()))) + .collect(), + crypto: Default::default(), + } } } -} -#[derive(Debug)] -pub struct ParquetLogicalExtensionCodec; + #[derive(Debug)] + pub struct ParquetLogicalExtensionCodec; -// TODO! This is a placeholder for now and needs to be implemented for real. -impl LogicalExtensionCodec for ParquetLogicalExtensionCodec { - fn try_decode( - &self, - _buf: &[u8], - _inputs: &[datafusion_expr::LogicalPlan], - _ctx: &SessionContext, - ) -> datafusion_common::Result { - not_impl_err!("Method not implemented") - } + // TODO! This is a placeholder for now and needs to be implemented for real. 
+ impl LogicalExtensionCodec for ParquetLogicalExtensionCodec { + fn try_decode( + &self, + _buf: &[u8], + _inputs: &[datafusion_expr::LogicalPlan], + _ctx: &SessionContext, + ) -> datafusion_common::Result { + not_impl_err!("Method not implemented") + } - fn try_encode( - &self, - _node: &datafusion_expr::Extension, - _buf: &mut Vec, - ) -> datafusion_common::Result<()> { - not_impl_err!("Method not implemented") - } + fn try_encode( + &self, + _node: &datafusion_expr::Extension, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } - fn try_decode_table_provider( - &self, - _buf: &[u8], - _table_ref: &TableReference, - _schema: arrow::datatypes::SchemaRef, - _ctx: &SessionContext, - ) -> datafusion_common::Result> { - not_impl_err!("Method not implemented") - } + fn try_decode_table_provider( + &self, + _buf: &[u8], + _table_ref: &TableReference, + _schema: arrow::datatypes::SchemaRef, + _ctx: &SessionContext, + ) -> datafusion_common::Result> + { + not_impl_err!("Method not implemented") + } - fn try_encode_table_provider( - &self, - _table_ref: &TableReference, - _node: Arc, - _buf: &mut Vec, - ) -> datafusion_common::Result<()> { - not_impl_err!("Method not implemented") - } + fn try_encode_table_provider( + &self, + _table_ref: &TableReference, + _node: Arc, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } - fn try_decode_file_format( - &self, - buf: &[u8], - _ctx: &SessionContext, - ) -> datafusion_common::Result> { - let proto = TableParquetOptionsProto::decode(buf).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to decode TableParquetOptionsProto: {e:?}" + fn try_decode_file_format( + &self, + buf: &[u8], + _ctx: &SessionContext, + ) -> datafusion_common::Result> { + let proto = TableParquetOptionsProto::decode(buf).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to decode TableParquetOptionsProto: {e:?}" + )) + })?; + let options: TableParquetOptions = (&proto).into(); + Ok(Arc::new( + datafusion::datasource::file_format::parquet::ParquetFormatFactory { + options: Some(options), + }, )) - })?; - let options: TableParquetOptions = (&proto).into(); - Ok(Arc::new(ParquetFormatFactory { - options: Some(options), - })) - } + } - fn try_encode_file_format( - &self, - buf: &mut Vec, - node: Arc, - ) -> datafusion_common::Result<()> { - let options = if let Some(parquet_factory) = - node.as_any().downcast_ref::() - { - parquet_factory.options.clone().unwrap_or_default() - } else { - return Err(DataFusionError::Execution( - "Unsupported FileFormatFactory type".to_string(), - )); - }; + fn try_encode_file_format( + &self, + buf: &mut Vec, + node: Arc, + ) -> datafusion_common::Result<()> { + use datafusion::datasource::file_format::parquet::ParquetFormatFactory; + + let options = if let Some(parquet_factory) = + node.as_any().downcast_ref::() + { + parquet_factory.options.clone().unwrap_or_default() + } else { + return Err(DataFusionError::Execution( + "Unsupported FileFormatFactory type".to_string(), + )); + }; - let proto = TableParquetOptionsProto::from_factory(&ParquetFormatFactory { - options: Some(options), - }); + let proto = TableParquetOptionsProto::from_factory(&ParquetFormatFactory { + options: Some(options), + }); - proto.encode(buf).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to encode TableParquetOptionsProto: {e:?}" - )) - })?; + proto.encode(buf).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to encode 
TableParquetOptionsProto: {e:?}" + )) + })?; - Ok(()) + Ok(()) + } } } +#[cfg(feature = "parquet")] +pub use parquet::ParquetLogicalExtensionCodec; #[derive(Debug)] pub struct ArrowLogicalExtensionCodec; diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index cc3e805ed1..6687cc31a3 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -632,6 +632,7 @@ impl AsLogicalPlan for LogicalPlanNode { .clone(), order_exprs, if_not_exists: create_extern_table.if_not_exists, + or_replace: create_extern_table.or_replace, temporary: create_extern_table.temporary, definition, unbounded: create_extern_table.unbounded, @@ -1469,6 +1470,7 @@ impl AsLogicalPlan for LogicalPlanNode { schema: df_schema, table_partition_cols, if_not_exists, + or_replace, definition, order_exprs, unbounded, @@ -1502,6 +1504,7 @@ impl AsLogicalPlan for LogicalPlanNode { schema: Some(df_schema.try_into()?), table_partition_cols: table_partition_cols.clone(), if_not_exists: *if_not_exists, + or_replace: *or_replace, temporary: *temporary, order_exprs: converted_order_exprs, definition: definition.clone().unwrap_or_default(), diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 85de56e972..ee69ab75b2 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -38,7 +38,7 @@ use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ FileGroup, FileScanConfig, FileScanConfigBuilder, FileSinkConfig, FileSource, }; -use datafusion::execution::FunctionRegistry; +use datafusion::execution::{FunctionRegistry, TaskContext}; use datafusion::logical_expr::WindowFunctionDefinition; use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr}; use datafusion::physical_plan::expressions::{ @@ -47,8 +47,6 @@ use datafusion::physical_plan::expressions::{ }; use datafusion::physical_plan::windows::{create_window_expr, schema_add_window_field}; use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr}; -use datafusion::prelude::SessionContext; -use datafusion_common::config::ConfigOptions; use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_proto_common::common::proto_error; @@ -76,7 +74,7 @@ impl From<&protobuf::PhysicalColumn> for Column { /// * `codec` - An extension codec used to decode custom UDFs. pub fn parse_physical_sort_expr( proto: &protobuf::PhysicalSortExprNode, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result { @@ -103,7 +101,7 @@ pub fn parse_physical_sort_expr( /// * `codec` - An extension codec used to decode custom UDFs. pub fn parse_physical_sort_exprs( proto: &[protobuf::PhysicalSortExprNode], - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result> { @@ -125,7 +123,7 @@ pub fn parse_physical_sort_exprs( /// * `codec` - An extension codec used to decode custom UDFs. 
pub fn parse_physical_window_expr( proto: &protobuf::PhysicalWindowExprNode, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result> { @@ -177,7 +175,7 @@ pub fn parse_physical_window_expr( &partition_by, &order_by, Arc::new(window_frame), - &extended_schema, + extended_schema, proto.ignore_nulls, proto.distinct, None, @@ -186,7 +184,7 @@ pub fn parse_physical_window_expr( pub fn parse_physical_exprs<'a, I>( protos: I, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result>> @@ -210,7 +208,7 @@ where /// * `codec` - An extension codec used to decode custom UDFs. pub fn parse_physical_expr( proto: &protobuf::PhysicalExprNode, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result> { @@ -364,11 +362,8 @@ pub fn parse_physical_expr( let scalar_fun_def = Arc::clone(&udf); let args = parse_physical_exprs(&e.args, ctx, input_schema, codec)?; - let config_options = - match ctx.state().execution_props().config_options.as_ref() { - Some(config_options) => Arc::clone(config_options), - None => Arc::new(ConfigOptions::default()), - }; + + let config_options = Arc::clone(ctx.session_config().options()); Arc::new( ScalarFunctionExpr::new( @@ -419,7 +414,7 @@ pub fn parse_physical_expr( fn parse_required_physical_expr( expr: Option<&protobuf::PhysicalExprNode>, - ctx: &SessionContext, + ctx: &TaskContext, field: &str, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, @@ -433,7 +428,7 @@ fn parse_required_physical_expr( pub fn parse_protobuf_hash_partitioning( partitioning: Option<&protobuf::PhysicalHashRepartition>, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result> { @@ -453,7 +448,7 @@ pub fn parse_protobuf_hash_partitioning( pub fn parse_protobuf_partitioning( partitioning: Option<&protobuf::Partitioning>, - ctx: &SessionContext, + ctx: &TaskContext, input_schema: &Schema, codec: &dyn PhysicalExtensionCodec, ) -> Result> { @@ -491,7 +486,7 @@ pub fn parse_protobuf_file_scan_schema( pub fn parse_protobuf_file_scan_config( proto: &protobuf::FileScanExecConf, - ctx: &SessionContext, + ctx: &TaskContext, codec: &dyn PhysicalExtensionCodec, file_source: Arc, ) -> Result { diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 04a4372c19..e7d8479c14 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -57,8 +57,7 @@ use datafusion::datasource::physical_plan::{ }; use datafusion::datasource::sink::DataSinkExec; use datafusion::datasource::source::{DataSource, DataSourceExec}; -use datafusion::execution::runtime_env::RuntimeEnv; -use datafusion::execution::FunctionRegistry; +use datafusion::execution::{FunctionRegistry, TaskContext}; use datafusion::functions_table::generate_series::{ Empty, GenSeriesArgs, GenerateSeriesTable, GenericSeriesState, TimestampValue, }; @@ -98,7 +97,6 @@ use datafusion_common::config::TableParquetOptions; use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; -use datafusion::prelude::SessionContext; use prost::bytes::BufMut; use prost::Message; @@ -127,8 +125,8 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { fn try_into_physical_plan( &self, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn 
PhysicalExtensionCodec, ) -> Result> { let plan = self.physical_plan_type.as_ref().ok_or_else(|| { @@ -137,171 +135,118 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { )) })?; match plan { - PhysicalPlanType::Explain(explain) => self.try_into_explain_physical_plan( - explain, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::Projection(projection) => self - .try_into_projection_physical_plan( - projection, - ctx, - runtime, - extension_codec, - ), + PhysicalPlanType::Explain(explain) => { + self.try_into_explain_physical_plan(explain, ctx, extension_codec) + } + PhysicalPlanType::Projection(projection) => { + self.try_into_projection_physical_plan(projection, ctx, extension_codec) + } PhysicalPlanType::Filter(filter) => { - self.try_into_filter_physical_plan(filter, ctx, runtime, extension_codec) + self.try_into_filter_physical_plan(filter, ctx, extension_codec) } PhysicalPlanType::CsvScan(scan) => { - self.try_into_csv_scan_physical_plan(scan, ctx, runtime, extension_codec) + self.try_into_csv_scan_physical_plan(scan, ctx, extension_codec) } PhysicalPlanType::JsonScan(scan) => { - self.try_into_json_scan_physical_plan(scan, ctx, runtime, extension_codec) + self.try_into_json_scan_physical_plan(scan, ctx, extension_codec) } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] - PhysicalPlanType::ParquetScan(scan) => self - .try_into_parquet_scan_physical_plan(scan, ctx, runtime, extension_codec), + PhysicalPlanType::ParquetScan(scan) => { + self.try_into_parquet_scan_physical_plan(scan, ctx, extension_codec) + } #[cfg_attr(not(feature = "avro"), allow(unused_variables))] PhysicalPlanType::AvroScan(scan) => { - self.try_into_avro_scan_physical_plan(scan, ctx, runtime, extension_codec) + self.try_into_avro_scan_physical_plan(scan, ctx, extension_codec) + } + PhysicalPlanType::MemoryScan(scan) => { + self.try_into_memory_scan_physical_plan(scan, ctx, extension_codec) } - PhysicalPlanType::MemoryScan(scan) => self - .try_into_memory_scan_physical_plan(scan, ctx, runtime, extension_codec), PhysicalPlanType::CoalesceBatches(coalesce_batches) => self .try_into_coalesce_batches_physical_plan( coalesce_batches, ctx, - runtime, extension_codec, ), PhysicalPlanType::Merge(merge) => { - self.try_into_merge_physical_plan(merge, ctx, runtime, extension_codec) + self.try_into_merge_physical_plan(merge, ctx, extension_codec) + } + PhysicalPlanType::Repartition(repart) => { + self.try_into_repartition_physical_plan(repart, ctx, extension_codec) + } + PhysicalPlanType::GlobalLimit(limit) => { + self.try_into_global_limit_physical_plan(limit, ctx, extension_codec) + } + PhysicalPlanType::LocalLimit(limit) => { + self.try_into_local_limit_physical_plan(limit, ctx, extension_codec) + } + PhysicalPlanType::Window(window_agg) => { + self.try_into_window_physical_plan(window_agg, ctx, extension_codec) + } + PhysicalPlanType::Aggregate(hash_agg) => { + self.try_into_aggregate_physical_plan(hash_agg, ctx, extension_codec) + } + PhysicalPlanType::HashJoin(hashjoin) => { + self.try_into_hash_join_physical_plan(hashjoin, ctx, extension_codec) } - PhysicalPlanType::Repartition(repart) => self - .try_into_repartition_physical_plan( - repart, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::GlobalLimit(limit) => self - .try_into_global_limit_physical_plan( - limit, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::LocalLimit(limit) => self - .try_into_local_limit_physical_plan(limit, ctx, runtime, extension_codec), - PhysicalPlanType::Window(window_agg) => 
self.try_into_window_physical_plan( - window_agg, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::Aggregate(hash_agg) => self - .try_into_aggregate_physical_plan( - hash_agg, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::HashJoin(hashjoin) => self - .try_into_hash_join_physical_plan( - hashjoin, - ctx, - runtime, - extension_codec, - ), PhysicalPlanType::SymmetricHashJoin(sym_join) => self .try_into_symmetric_hash_join_physical_plan( sym_join, ctx, - runtime, extension_codec, ), PhysicalPlanType::Union(union) => { - self.try_into_union_physical_plan(union, ctx, runtime, extension_codec) + self.try_into_union_physical_plan(union, ctx, extension_codec) + } + PhysicalPlanType::Interleave(interleave) => { + self.try_into_interleave_physical_plan(interleave, ctx, extension_codec) + } + PhysicalPlanType::CrossJoin(crossjoin) => { + self.try_into_cross_join_physical_plan(crossjoin, ctx, extension_codec) } - PhysicalPlanType::Interleave(interleave) => self - .try_into_interleave_physical_plan( - interleave, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::CrossJoin(crossjoin) => self - .try_into_cross_join_physical_plan( - crossjoin, - ctx, - runtime, - extension_codec, - ), PhysicalPlanType::Empty(empty) => { - self.try_into_empty_physical_plan(empty, ctx, runtime, extension_codec) + self.try_into_empty_physical_plan(empty, ctx, extension_codec) } PhysicalPlanType::PlaceholderRow(placeholder) => self .try_into_placeholder_row_physical_plan( placeholder, ctx, - runtime, extension_codec, ), PhysicalPlanType::Sort(sort) => { - self.try_into_sort_physical_plan(sort, ctx, runtime, extension_codec) + self.try_into_sort_physical_plan(sort, ctx, extension_codec) } PhysicalPlanType::SortPreservingMerge(sort) => self - .try_into_sort_preserving_merge_physical_plan( - sort, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::Extension(extension) => self - .try_into_extension_physical_plan( - extension, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::NestedLoopJoin(join) => self - .try_into_nested_loop_join_physical_plan( - join, - ctx, - runtime, - extension_codec, - ), - PhysicalPlanType::Analyze(analyze) => self.try_into_analyze_physical_plan( - analyze, - ctx, - runtime, - extension_codec, - ), + .try_into_sort_preserving_merge_physical_plan(sort, ctx, extension_codec), + PhysicalPlanType::Extension(extension) => { + self.try_into_extension_physical_plan(extension, ctx, extension_codec) + } + PhysicalPlanType::NestedLoopJoin(join) => { + self.try_into_nested_loop_join_physical_plan(join, ctx, extension_codec) + } + PhysicalPlanType::Analyze(analyze) => { + self.try_into_analyze_physical_plan(analyze, ctx, extension_codec) + } PhysicalPlanType::JsonSink(sink) => { - self.try_into_json_sink_physical_plan(sink, ctx, runtime, extension_codec) + self.try_into_json_sink_physical_plan(sink, ctx, extension_codec) } PhysicalPlanType::CsvSink(sink) => { - self.try_into_csv_sink_physical_plan(sink, ctx, runtime, extension_codec) + self.try_into_csv_sink_physical_plan(sink, ctx, extension_codec) } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] - PhysicalPlanType::ParquetSink(sink) => self - .try_into_parquet_sink_physical_plan(sink, ctx, runtime, extension_codec), + PhysicalPlanType::ParquetSink(sink) => { + self.try_into_parquet_sink_physical_plan(sink, ctx, extension_codec) + } PhysicalPlanType::Unnest(unnest) => { - self.try_into_unnest_physical_plan(unnest, ctx, runtime, extension_codec) + 
self.try_into_unnest_physical_plan(unnest, ctx, extension_codec) + } + PhysicalPlanType::Cooperative(cooperative) => { + self.try_into_cooperative_physical_plan(cooperative, ctx, extension_codec) } - PhysicalPlanType::Cooperative(cooperative) => self - .try_into_cooperative_physical_plan( - cooperative, - ctx, - runtime, - extension_codec, - ), PhysicalPlanType::GenerateSeries(generate_series) => { self.try_into_generate_series_physical_plan(generate_series) } PhysicalPlanType::SortMergeJoin(sort_join) => { - self.try_into_sort_join(sort_join, ctx, runtime, extension_codec) + self.try_into_sort_join(sort_join, ctx, extension_codec) } } } @@ -546,8 +491,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_explain_physical_plan( &self, explain: &protobuf::ExplainExecNode, - _ctx: &SessionContext, - _runtime: &RuntimeEnv, + _ctx: &TaskContext, + _extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { Ok(Arc::new(ExplainExec::new( @@ -564,12 +509,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_projection_physical_plan( &self, projection: &protobuf::ProjectionExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&projection.input, ctx, runtime, extension_codec)?; + into_physical_plan(&projection.input, ctx, extension_codec)?; let exprs = projection .expr .iter() @@ -596,12 +541,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_filter_physical_plan( &self, filter: &protobuf::FilterExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&filter.input, ctx, runtime, extension_codec)?; + into_physical_plan(&filter.input, ctx, extension_codec)?; let predicate = filter .expr @@ -644,8 +589,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_csv_scan_physical_plan( &self, scan: &protobuf::CsvScanExecNode, - ctx: &SessionContext, - _runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let escape = @@ -691,8 +636,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_json_scan_physical_plan( &self, scan: &protobuf::JsonScanExecNode, - ctx: &SessionContext, - _runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let scan_conf = parse_protobuf_file_scan_config( @@ -708,8 +653,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_parquet_scan_physical_plan( &self, scan: &protobuf::ParquetScanExecNode, - ctx: &SessionContext, - _runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { #[cfg(feature = "parquet")] @@ -769,8 +714,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_avro_scan_physical_plan( &self, scan: &protobuf::AvroScanExecNode, - ctx: &SessionContext, - _runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { #[cfg(feature = "avro")] @@ -790,8 +735,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_memory_scan_physical_plan( &self, scan: &protobuf::MemoryScanExecNode, - ctx: &SessionContext, - _runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let partitions = scan @@ -841,12 +786,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_coalesce_batches_physical_plan( &self, coalesce_batches: &protobuf::CoalesceBatchesExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: 
&TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&coalesce_batches.input, ctx, runtime, extension_codec)?; + into_physical_plan(&coalesce_batches.input, ctx, extension_codec)?; Ok(Arc::new( CoalesceBatchesExec::new(input, coalesce_batches.target_batch_size as usize) .with_fetch(coalesce_batches.fetch.map(|f| f as usize)), @@ -856,12 +801,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_merge_physical_plan( &self, merge: &protobuf::CoalescePartitionsExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&merge.input, ctx, runtime, extension_codec)?; + into_physical_plan(&merge.input, ctx, extension_codec)?; Ok(Arc::new( CoalescePartitionsExec::new(input) .with_fetch(merge.fetch.map(|f| f as usize)), @@ -871,12 +816,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_repartition_physical_plan( &self, repart: &protobuf::RepartitionExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&repart.input, ctx, runtime, extension_codec)?; + into_physical_plan(&repart.input, ctx, extension_codec)?; let partitioning = parse_protobuf_partitioning( repart.partitioning.as_ref(), ctx, @@ -892,12 +837,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_global_limit_physical_plan( &self, limit: &protobuf::GlobalLimitExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&limit.input, ctx, runtime, extension_codec)?; + into_physical_plan(&limit.input, ctx, extension_codec)?; let fetch = if limit.fetch >= 0 { Some(limit.fetch as usize) } else { @@ -913,24 +858,24 @@ impl protobuf::PhysicalPlanNode { fn try_into_local_limit_physical_plan( &self, limit: &protobuf::LocalLimitExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&limit.input, ctx, runtime, extension_codec)?; + into_physical_plan(&limit.input, ctx, extension_codec)?; Ok(Arc::new(LocalLimitExec::new(input, limit.fetch as usize))) } fn try_into_window_physical_plan( &self, window_agg: &protobuf::WindowAggExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&window_agg.input, ctx, runtime, extension_codec)?; + into_physical_plan(&window_agg.input, ctx, extension_codec)?; let input_schema = input.schema(); let physical_window_expr: Vec> = window_agg @@ -983,12 +928,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_aggregate_physical_plan( &self, hash_agg: &protobuf::AggregateExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&hash_agg.input, ctx, runtime, extension_codec)?; + into_physical_plan(&hash_agg.input, ctx, extension_codec)?; let mode = protobuf::AggregateMode::try_from(hash_agg.mode).map_err(|_| { proto_error(format!( "Received a AggregateNode message with unknown AggregateMode {}", @@ -1151,14 +1096,14 @@ impl protobuf::PhysicalPlanNode { fn try_into_hash_join_physical_plan( &self, hashjoin: &protobuf::HashJoinExecNode, - ctx: 
&SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let left: Arc = - into_physical_plan(&hashjoin.left, ctx, runtime, extension_codec)?; + into_physical_plan(&hashjoin.left, ctx, extension_codec)?; let right: Arc = - into_physical_plan(&hashjoin.right, ctx, runtime, extension_codec)?; + into_physical_plan(&hashjoin.right, ctx, extension_codec)?; let left_schema = left.schema(); let right_schema = right.schema(); let on: Vec<(PhysicalExprRef, PhysicalExprRef)> = hashjoin @@ -1269,12 +1214,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_symmetric_hash_join_physical_plan( &self, sym_join: &protobuf::SymmetricHashJoinExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let left = into_physical_plan(&sym_join.left, ctx, runtime, extension_codec)?; - let right = into_physical_plan(&sym_join.right, ctx, runtime, extension_codec)?; + let left = into_physical_plan(&sym_join.left, ctx, extension_codec)?; + let right = into_physical_plan(&sym_join.right, ctx, extension_codec)?; let left_schema = left.schema(); let right_schema = right.schema(); let on = sym_join @@ -1397,13 +1342,13 @@ impl protobuf::PhysicalPlanNode { fn try_into_union_physical_plan( &self, union: &protobuf::UnionExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let mut inputs: Vec> = vec![]; for input in &union.inputs { - inputs.push(input.try_into_physical_plan(ctx, runtime, extension_codec)?); + inputs.push(input.try_into_physical_plan(ctx, extension_codec)?); } UnionExec::try_new(inputs) } @@ -1411,13 +1356,13 @@ impl protobuf::PhysicalPlanNode { fn try_into_interleave_physical_plan( &self, interleave: &protobuf::InterleaveExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let mut inputs: Vec> = vec![]; for input in &interleave.inputs { - inputs.push(input.try_into_physical_plan(ctx, runtime, extension_codec)?); + inputs.push(input.try_into_physical_plan(ctx, extension_codec)?); } Ok(Arc::new(InterleaveExec::try_new(inputs)?)) } @@ -1425,22 +1370,22 @@ impl protobuf::PhysicalPlanNode { fn try_into_cross_join_physical_plan( &self, crossjoin: &protobuf::CrossJoinExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let left: Arc = - into_physical_plan(&crossjoin.left, ctx, runtime, extension_codec)?; + into_physical_plan(&crossjoin.left, ctx, extension_codec)?; let right: Arc = - into_physical_plan(&crossjoin.right, ctx, runtime, extension_codec)?; + into_physical_plan(&crossjoin.right, ctx, extension_codec)?; Ok(Arc::new(CrossJoinExec::new(left, right))) } fn try_into_empty_physical_plan( &self, empty: &protobuf::EmptyExecNode, - _ctx: &SessionContext, - _runtime: &RuntimeEnv, + _ctx: &TaskContext, + _extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let schema = Arc::new(convert_required!(empty.schema)?); @@ -1450,8 +1395,8 @@ impl protobuf::PhysicalPlanNode { fn try_into_placeholder_row_physical_plan( &self, placeholder: &protobuf::PlaceholderRowExecNode, - _ctx: &SessionContext, - _runtime: &RuntimeEnv, + _ctx: &TaskContext, + _extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let schema = Arc::new(convert_required!(placeholder.schema)?); @@ -1461,11 +1406,11 @@ impl 
protobuf::PhysicalPlanNode { fn try_into_sort_physical_plan( &self, sort: &protobuf::SortExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = into_physical_plan(&sort.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&sort.input, ctx, extension_codec)?; let exprs = sort .expr .iter() @@ -1513,11 +1458,11 @@ impl protobuf::PhysicalPlanNode { fn try_into_sort_preserving_merge_physical_plan( &self, sort: &protobuf::SortPreservingMergeExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = into_physical_plan(&sort.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&sort.input, ctx, extension_codec)?; let exprs = sort .expr .iter() @@ -1566,14 +1511,14 @@ impl protobuf::PhysicalPlanNode { fn try_into_extension_physical_plan( &self, extension: &protobuf::PhysicalExtensionNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let inputs: Vec> = extension .inputs .iter() - .map(|i| i.try_into_physical_plan(ctx, runtime, extension_codec)) + .map(|i| i.try_into_physical_plan(ctx, extension_codec)) .collect::>()?; let extension_node = @@ -1585,14 +1530,14 @@ impl protobuf::PhysicalPlanNode { fn try_into_nested_loop_join_physical_plan( &self, join: &protobuf::NestedLoopJoinExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let left: Arc = - into_physical_plan(&join.left, ctx, runtime, extension_codec)?; + into_physical_plan(&join.left, ctx, extension_codec)?; let right: Arc = - into_physical_plan(&join.right, ctx, runtime, extension_codec)?; + into_physical_plan(&join.right, ctx, extension_codec)?; let join_type = protobuf::JoinType::try_from(join.join_type).map_err(|_| { proto_error(format!( "Received a NestedLoopJoinExecNode message with unknown JoinType {}", @@ -1659,12 +1604,12 @@ impl protobuf::PhysicalPlanNode { fn try_into_analyze_physical_plan( &self, analyze: &protobuf::AnalyzeExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let input: Arc = - into_physical_plan(&analyze.input, ctx, runtime, extension_codec)?; + into_physical_plan(&analyze.input, ctx, extension_codec)?; Ok(Arc::new(AnalyzeExec::new( analyze.verbose, analyze.show_statistics, @@ -1676,11 +1621,11 @@ impl protobuf::PhysicalPlanNode { fn try_into_json_sink_physical_plan( &self, sink: &protobuf::JsonSinkExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = into_physical_plan(&sink.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&sink.input, ctx, extension_codec)?; let data_sink: JsonSink = sink .sink @@ -1714,11 +1659,11 @@ impl protobuf::PhysicalPlanNode { fn try_into_csv_sink_physical_plan( &self, sink: &protobuf::CsvSinkExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = into_physical_plan(&sink.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&sink.input, ctx, extension_codec)?; let data_sink: CsvSink = sink .sink @@ -1752,13 +1697,13 @@ impl protobuf::PhysicalPlanNode { 
fn try_into_parquet_sink_physical_plan( &self, sink: &protobuf::ParquetSinkExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { #[cfg(feature = "parquet")] { - let input = into_physical_plan(&sink.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&sink.input, ctx, extension_codec)?; let data_sink: ParquetSink = sink .sink @@ -1795,11 +1740,11 @@ impl protobuf::PhysicalPlanNode { fn try_into_unnest_physical_plan( &self, unnest: &protobuf::UnnestExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = into_physical_plan(&unnest.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&unnest.input, ctx, extension_codec)?; Ok(Arc::new(UnnestExec::new( input, @@ -1826,13 +1771,13 @@ impl protobuf::PhysicalPlanNode { fn try_into_sort_join( &self, sort_join: &SortMergeJoinExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let left = into_physical_plan(&sort_join.left, ctx, runtime, extension_codec)?; + let left = into_physical_plan(&sort_join.left, ctx, extension_codec)?; let left_schema = left.schema(); - let right = into_physical_plan(&sort_join.right, ctx, runtime, extension_codec)?; + let right = into_physical_plan(&sort_join.right, ctx, extension_codec)?; let right_schema = right.schema(); let filter = sort_join @@ -2003,12 +1948,11 @@ impl protobuf::PhysicalPlanNode { fn try_into_cooperative_physical_plan( &self, field_stream: &protobuf::CooperativeExecNode, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { - let input = - into_physical_plan(&field_stream.input, ctx, runtime, extension_codec)?; + let input = into_physical_plan(&field_stream.input, ctx, extension_codec)?; Ok(Arc::new(CooperativeExec::new(input))) } @@ -3276,8 +3220,8 @@ pub trait AsExecutionPlan: Debug + Send + Sync + Clone { fn try_into_physical_plan( &self, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result>; @@ -3294,7 +3238,7 @@ pub trait PhysicalExtensionCodec: Debug + Send + Sync { &self, buf: &[u8], inputs: &[Arc], - registry: &dyn FunctionRegistry, + ctx: &TaskContext, ) -> Result>; fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()>; @@ -3350,7 +3294,7 @@ impl PhysicalExtensionCodec for DefaultPhysicalExtensionCodec { &self, _buf: &[u8], _inputs: &[Arc], - _registry: &dyn FunctionRegistry, + _ctx: &TaskContext, ) -> Result> { not_impl_err!("PhysicalExtensionCodec is not provided") } @@ -3452,9 +3396,9 @@ impl PhysicalExtensionCodec for ComposedPhysicalExtensionCodec { &self, buf: &[u8], inputs: &[Arc], - registry: &dyn FunctionRegistry, + ctx: &TaskContext, ) -> Result> { - self.decode_protobuf(buf, |codec, data| codec.try_decode(data, inputs, registry)) + self.decode_protobuf(buf, |codec, data| codec.try_decode(data, inputs, ctx)) } fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { @@ -3480,12 +3424,12 @@ impl PhysicalExtensionCodec for ComposedPhysicalExtensionCodec { fn into_physical_plan( node: &Option>, - ctx: &SessionContext, - runtime: &RuntimeEnv, + ctx: &TaskContext, + extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { if let Some(field) = node { - field.try_into_physical_plan(ctx, runtime, 
extension_codec) + field.try_into_physical_plan(ctx, extension_codec) } else { Err(proto_error("Missing required field in protobuf")) } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index c76036a434..d0f25a85f7 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -32,6 +32,7 @@ use datafusion::execution::options::ArrowReadOptions; use datafusion::optimizer::eliminate_nested_union::EliminateNestedUnion; use datafusion::optimizer::Optimizer; use datafusion_common::parsers::CompressionTypeVariant; +use datafusion_functions_aggregate::sum::sum_distinct; use prost::Message; use std::any::Any; use std::collections::HashMap; @@ -82,8 +83,8 @@ use datafusion_expr::{ }; use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::expr_fn::{ - approx_distinct, array_agg, avg, bit_and, bit_or, bit_xor, bool_and, bool_or, corr, - nth_value, + approx_distinct, array_agg, avg, avg_distinct, bit_and, bit_or, bit_xor, bool_and, + bool_or, corr, nth_value, }; use datafusion_functions_aggregate::string_agg::string_agg; use datafusion_functions_window_common::field::WindowUDFFieldArgs; @@ -967,10 +968,12 @@ async fn roundtrip_expr_api() -> Result<()> { functions_window::nth_value::last_value(lit(1)), functions_window::nth_value::nth_value(lit(1), 1), avg(lit(1.5)), + avg_distinct(lit(1.5)), covar_samp(lit(1.5), lit(2.2)), covar_pop(lit(1.5), lit(2.2)), corr(lit(1.5), lit(2.2)), sum(lit(1)), + sum_distinct(lit(1)), max(lit(1)), median(lit(2)), min(lit(2)), @@ -1587,7 +1590,7 @@ fn round_trip_scalar_values_and_data_types() { assert_eq!( dt, roundtrip, "DataType was not the same after round trip!\n\n\ - Input: {dt:?}\n\nRoundtrip: {roundtrip:?}" + Input: {dt}\n\nRoundtrip: {roundtrip:?}" ); } } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 4b4403a5f3..c88c62952a 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -17,7 +17,6 @@ use std::any::Any; use std::fmt::{Display, Formatter}; -use std::ops::Deref; use std::sync::Arc; use std::vec; @@ -53,7 +52,7 @@ use datafusion::datasource::physical_plan::{ }; use datafusion::datasource::sink::DataSinkExec; use datafusion::datasource::source::DataSourceExec; -use datafusion::execution::FunctionRegistry; +use datafusion::execution::TaskContext; use datafusion::functions_aggregate::count::count_udaf; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::functions_window::nth_value::nth_value_udwf; @@ -138,9 +137,8 @@ fn roundtrip_test_and_return( let proto: protobuf::PhysicalPlanNode = protobuf::PhysicalPlanNode::try_from_physical_plan(exec_plan.clone(), codec) .expect("to proto"); - let runtime = ctx.runtime_env(); let result_exec_plan: Arc = proto - .try_into_physical_plan(ctx, runtime.deref(), codec) + .try_into_physical_plan(&ctx.task_ctx(), codec) .expect("from proto"); pretty_assertions::assert_eq!( @@ -1024,7 +1022,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { &self, _buf: &[u8], _inputs: &[Arc], - _registry: &dyn FunctionRegistry, + _ctx: &TaskContext, ) -> Result> { unreachable!() } @@ -1132,7 +1130,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec { &self, _buf: &[u8], _inputs: &[Arc], - _registry: &dyn FunctionRegistry, + _ctx: &TaskContext, ) -> Result> { 
not_impl_err!("No extension codec provided") } @@ -1736,11 +1734,8 @@ async fn roundtrip_coalesce() -> Result<()> { )?; let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice()) .map_err(|e| DataFusionError::External(Box::new(e)))?; - let restored = node.try_into_physical_plan( - &ctx, - ctx.runtime_env().as_ref(), - &DefaultPhysicalExtensionCodec {}, - )?; + let restored = + node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?; assert_eq!( plan.schema(), @@ -1775,11 +1770,8 @@ async fn roundtrip_generate_series() -> Result<()> { )?; let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice()) .map_err(|e| DataFusionError::External(Box::new(e)))?; - let restored = node.try_into_physical_plan( - &ctx, - ctx.runtime_env().as_ref(), - &DefaultPhysicalExtensionCodec {}, - )?; + let restored = + node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?; assert_eq!( plan.schema(), @@ -1901,11 +1893,7 @@ async fn roundtrip_physical_plan_node() { .unwrap(); let plan = node - .try_into_physical_plan( - &ctx, - &ctx.runtime_env(), - &DefaultPhysicalExtensionCodec {}, - ) + .try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {}) .unwrap(); let _ = plan.execute(0, ctx.task_ctx()).unwrap(); @@ -1985,7 +1973,7 @@ async fn test_serialize_deserialize_tpch_queries() -> Result<()> { // deserialize the physical plan let _deserialized_plan = - proto.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), &codec)?; + proto.try_into_physical_plan(&ctx.task_ctx(), &codec)?; } } @@ -2104,8 +2092,7 @@ async fn test_tpch_part_in_list_query_with_real_parquet_data() -> Result<()> { let proto = PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; // This will fail with the bug, but should succeed when fixed - let _deserialized_plan = - proto.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), &codec)?; + let _deserialized_plan = proto.try_into_physical_plan(&ctx.task_ctx(), &codec)?; Ok(()) } @@ -2133,11 +2120,8 @@ async fn analyze_roundtrip_unoptimized() -> Result<()> { let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice()) .map_err(|e| DataFusionError::External(Box::new(e)))?; - let unoptimized = node.try_into_physical_plan( - &ctx, - ctx.runtime_env().as_ref(), - &DefaultPhysicalExtensionCodec {}, - )?; + let unoptimized = + node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?; let physical_planner = datafusion::physical_planner::DefaultPhysicalPlanner::default(); diff --git a/datafusion/pruning/Cargo.toml b/datafusion/pruning/Cargo.toml index 095a5b692e..2429123bdf 100644 --- a/datafusion/pruning/Cargo.toml +++ b/datafusion/pruning/Cargo.toml @@ -1,6 +1,7 @@ [package] name = "datafusion-pruning" description = "DataFusion Pruning Logic" +readme = "README.md" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } diff --git a/datafusion/pruning/README.md b/datafusion/pruning/README.md new file mode 100644 index 0000000000..4db509193d --- /dev/null +++ b/datafusion/pruning/README.md @@ -0,0 +1,34 @@ + + +# Apache DataFusion Pruning Logic + +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate is a submodule of DataFusion that contains pruning logic, to analyze filter expressions with +statistics such as min/max values and null counts, proving files / large subsections of files can be skipped +without reading the actual data. 
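As an aside, a minimal conceptual sketch of the pruning idea described in the README above (this is illustrative only; the `FileStats` struct and the hard-coded predicate `x > 100` are hypothetical and are not the datafusion-pruning API): a container whose min/max statistics already rule out the filter can be skipped without reading its data.

```rust
// Conceptual sketch only -- `FileStats` and the fixed predicate `x > 100`
// are hypothetical, not part of the datafusion-pruning API.
struct FileStats {
    min_x: i64,
    max_x: i64,
}

/// Returns true if the file *might* contain a row matching `x > 100`.
/// If even the maximum value is <= 100, the file is provably prunable.
fn may_match(stats: &FileStats) -> bool {
    stats.max_x > 100
}

fn main() {
    let files = [
        FileStats { min_x: 1, max_x: 90 },   // pruned: no row can satisfy x > 100
        FileStats { min_x: 50, max_x: 500 }, // must be read: a match is possible
    ];
    for (i, f) in files.iter().enumerate() {
        println!("file {i}: read = {}", may_match(f));
    }
}
```

The real crate generalizes this: it rewrites an arbitrary filter expression into a predicate over per-container statistics (min/max, null counts, and so on) and evaluates that predicate to decide which containers can be skipped.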
+ +Most projects should use the [`datafusion`] crate directly, which re-exports +this module. If you are already using the [`datafusion`] crate, there is no +reason to use this crate directly in your project as well. + +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index 5e92dbe227..f0f87ffe1e 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -1094,8 +1094,8 @@ fn rewrite_expr_to_prunable( Ok((Arc::clone(column_expr), op, Arc::clone(scalar_expr))) } else if let Some(cast) = column_expr_any.downcast_ref::() { // `cast(col) op lit()` - let arrow_schema: SchemaRef = schema.clone().into(); - let from_type = cast.expr().data_type(&arrow_schema)?; + let arrow_schema = schema.as_arrow(); + let from_type = cast.expr().data_type(arrow_schema)?; verify_support_type_for_prune(&from_type, cast.cast_type())?; let (left, op, right) = rewrite_expr_to_prunable(cast.expr(), op, scalar_expr, schema)?; @@ -1109,8 +1109,8 @@ fn rewrite_expr_to_prunable( column_expr_any.downcast_ref::() { // `try_cast(col) op lit()` - let arrow_schema: SchemaRef = schema.clone().into(); - let from_type = try_cast.expr().data_type(&arrow_schema)?; + let arrow_schema = schema.as_arrow(); + let from_type = try_cast.expr().data_type(arrow_schema)?; verify_support_type_for_prune(&from_type, try_cast.cast_type())?; let (left, op, right) = rewrite_expr_to_prunable(try_cast.expr(), op, scalar_expr, schema)?; diff --git a/datafusion/session/Cargo.toml b/datafusion/session/Cargo.toml index 09bfbe1c48..0489da61ee 100644 --- a/datafusion/session/Cargo.toml +++ b/datafusion/session/Cargo.toml @@ -18,11 +18,11 @@ [package] name = "datafusion-session" description = "datafusion-session" +readme = "README.md" authors.workspace = true edition.workspace = true homepage.workspace = true license.workspace = true -readme.workspace = true repository.workspace = true rust-version.workspace = true version.workspace = true diff --git a/datafusion/session/README.md b/datafusion/session/README.md index f029c79736..4bb605b1e1 100644 --- a/datafusion/session/README.md +++ b/datafusion/session/README.md @@ -17,9 +17,9 @@ under the License. --> -# DataFusion Session +# Apache DataFusion Session -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. This crate provides **session-related abstractions** used in the DataFusion query engine. A _session_ represents the runtime context for query execution, including configuration, runtime environment, function registry, and planning. @@ -27,5 +27,6 @@ Most projects should use the [`datafusion`] crate directly, which re-exports this module. If you are already using the [`datafusion`] crate, there is no reason to use this crate directly in your project as well. 
-[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion diff --git a/datafusion/spark/README.md b/datafusion/spark/README.md index c92ada0ab4..7cb24084cd 100644 --- a/datafusion/spark/README.md +++ b/datafusion/spark/README.md @@ -17,9 +17,15 @@ specific language governing permissions and limitations under the License. --> -# datafusion-spark: Spark-compatible Expressions +# Apache DataFusion Spark-compatible Expressions -This crate provides Apache Spark-compatible expressions for use with DataFusion. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate is a submodule of DataFusion that provides [Apache Spark] compatible expressions for use with DataFusion. + +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[apache spark]: https://spark.apache.org/ ## Testing Guide @@ -29,12 +35,15 @@ or `coerce_types`) is not applied. Therefore, direct invocation tests should only be used to verify that the function is correctly implemented. Please be sure to add additional tests beyond direct invocation. -For more detailed testing guidelines, refer to -the [Spark SQLLogicTest README](../sqllogictest/test_files/spark/README.md). +For more detailed testing guidelines, refer to the [Spark SQLLogicTest README]. ## Implementation References When implementing Spark-compatible functions, you can check if there are existing implementations in -the [Sail](https://github.com/lakehq/sail) or [Comet](https://github.com/apache/datafusion-comet) projects first. +the [Sail] or [Comet] projects first. If you do port functionality from these sources, make sure to port over the corresponding tests too, to ensure correctness and compatibility. 
+ +[spark sqllogictest readme]: ../sqllogictest/test_files/spark/README.md +[sail]: https://github.com/lakehq/sail +[comet]: https://github.com/apache/datafusion-comet diff --git a/datafusion/spark/src/function/array/spark_array.rs b/datafusion/spark/src/function/array/spark_array.rs index 1644cde7ab..bf5842cb5a 100644 --- a/datafusion/spark/src/function/array/spark_array.rs +++ b/datafusion/spark/src/function/array/spark_array.rs @@ -133,7 +133,7 @@ impl ScalarUDFImpl for SparkArray { if let Some(coerced_type) = coerced_type { Ok(coerced_type) } else { - plan_err!("Coercion from {acc:?} to {x:?} failed.") + plan_err!("Coercion from {acc} to {x} failed.") } })?; Ok(vec![new_type; arg_types.len()]) diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs index 966b0930f0..15bd33229a 100644 --- a/datafusion/spark/src/function/bitmap/bitmap_count.rs +++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs @@ -27,12 +27,13 @@ use arrow::datatypes::DataType::{ Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary, }; use datafusion_common::utils::take_function_args; -use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::{ - ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, + Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignatureClass, Volatility, }; +use datafusion_functions::downcast_arg; use datafusion_functions::utils::make_scalar_function; -use datafusion_functions::{downcast_arg, downcast_named_arg}; #[derive(Debug, PartialEq, Eq, Hash)] pub struct BitmapCount { @@ -48,8 +49,10 @@ impl Default for BitmapCount { impl BitmapCount { pub fn new() -> Self { Self { - // TODO: add definitive TypeSignature after https://github.com/apache/datafusion/issues/17291 is done - signature: Signature::any(1, Volatility::Immutable), + signature: Signature::coercible( + vec![Coercion::new_exact(TypeSignatureClass::Binary)], + Volatility::Immutable, + ), } } } @@ -67,15 +70,8 @@ impl ScalarUDFImpl for BitmapCount { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> Result { - match arg_types.first() { - Some(Binary | BinaryView | FixedSizeBinary(_) | LargeBinary) => Ok(Int64), - Some(data_type) => plan_err!( - "bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as argument, got {:?}", - data_type - ), - None => internal_err!("bitmap_count does not support zero arguments"), - } + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Int64) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { @@ -105,7 +101,7 @@ pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result { downcast_and_count_ones!(input_array, FixedSizeBinaryArray) } data_type => { - internal_err!("bitmap_count does not support {:?}", data_type) + internal_err!("bitmap_count does not support {data_type}") } }; diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 73566b9dbd..ba44d3bc0a 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -140,7 +140,7 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { } _ => { plan_err!( - "bit_count function does not support data type: {:?}", + "bit_count function does not support data type: {}", value_array.data_type() ) } diff --git a/datafusion/spark/src/function/bitwise/bit_shift.rs 
b/datafusion/spark/src/function/bitwise/bit_shift.rs index 79f62587c0..bb645b7660 100644 --- a/datafusion/spark/src/function/bitwise/bit_shift.rs +++ b/datafusion/spark/src/function/bitwise/bit_shift.rs @@ -194,7 +194,7 @@ trait BitShiftUDF: ScalarUDFImpl { } _ => { plan_err!( - "{} function does not support data type: {:?}", + "{} function does not support data type: {}", self.name(), value_array.data_type() ) diff --git a/datafusion/spark/src/function/datetime/date_sub.rs b/datafusion/spark/src/function/datetime/date_sub.rs index aa10c05b8a..c19d04e617 100644 --- a/datafusion/spark/src/function/datetime/date_sub.rs +++ b/datafusion/spark/src/function/datetime/date_sub.rs @@ -114,7 +114,7 @@ fn spark_date_sub(args: &[ArrayRef]) -> Result { } _ => { return internal_err!( - "Spark `date_add` function: argument must be int8, int16, int32, got {:?}", + "Spark `date_sub` function: argument must be int8, int16, int32, got {:?}", days_arg.data_type() ); } diff --git a/datafusion/spark/src/function/datetime/make_dt_interval.rs b/datafusion/spark/src/function/datetime/make_dt_interval.rs new file mode 100644 index 0000000000..c44ab69b8b --- /dev/null +++ b/datafusion/spark/src/function/datetime/make_dt_interval.rs @@ -0,0 +1,485 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, AsArray, DurationMicrosecondBuilder, PrimitiveArray, +}; +use arrow::datatypes::TimeUnit::Microsecond; +use arrow::datatypes::{DataType, Float64Type, Int32Type}; +use datafusion_common::{ + exec_err, plan_datafusion_err, DataFusionError, Result, ScalarValue, +}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion_functions::utils::make_scalar_function; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkMakeDtInterval { + signature: Signature, +} + +impl Default for SparkMakeDtInterval { + fn default() -> Self { + Self::new() + } +} + +impl SparkMakeDtInterval { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkMakeDtInterval { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_dt_interval" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + /// Note the return type is `DataType::Duration(TimeUnit::Microsecond)` and not `DataType::Interval(DayTime)` as you might expect. + /// This is because `DataType::Interval(DayTime)` has precision only to the millisecond, whilst Spark's `DayTimeIntervalType` has + /// precision to the microsecond. We use `DataType::Duration(TimeUnit::Microsecond)` in order to not lose any precision. See the + /// [Sail compatibility doc] for reference. 
+ /// + /// [Sail compatibility doc]: https://github.com/lakehq/sail/blob/dc5368daa24d40a7758a299e1ba8fc985cb29108/docs/guide/dataframe/data-types/compatibility.md?plain=1#L260 + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Duration(Microsecond)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + if args.args.is_empty() { + return Ok(ColumnarValue::Scalar(ScalarValue::DurationMicrosecond( + Some(0), + ))); + } + make_scalar_function(make_dt_interval_kernel, vec![])(&args.args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + if arg_types.len() > 4 { + return exec_err!( + "make_dt_interval expects between 0 and 4 arguments, got {}", + arg_types.len() + ); + } + + Ok((0..arg_types.len()) + .map(|i| { + if i == 3 { + DataType::Float64 + } else { + DataType::Int32 + } + }) + .collect()) + } +} + +fn make_dt_interval_kernel(args: &[ArrayRef]) -> Result { + let n_rows = args[0].len(); + let days = args[0] + .as_primitive_opt::() + .ok_or_else(|| plan_datafusion_err!("make_dt_interval arg[0] must be Int32"))?; + let hours: Option<&PrimitiveArray> = args + .get(1) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[1] must be Int32") + }) + }) + .transpose()?; + let mins: Option<&PrimitiveArray> = args + .get(2) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[2] must be Int32") + }) + }) + .transpose()?; + let secs: Option<&PrimitiveArray> = args + .get(3) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[3] must be Float64") + }) + }) + .transpose()?; + let mut builder = DurationMicrosecondBuilder::with_capacity(n_rows); + + for i in 0..n_rows { + // if one column is NULL → result NULL + let any_null_present = days.is_null(i) + || hours.as_ref().is_some_and(|a| a.is_null(i)) + || mins.as_ref().is_some_and(|a| a.is_null(i)) + || secs + .as_ref() + .is_some_and(|a| a.is_null(i) || !a.value(i).is_finite()); + + if any_null_present { + builder.append_null(); + continue; + } + + // default values 0 or 0.0 + let d = days.value(i); + let h = hours.as_ref().map_or(0, |a| a.value(i)); + let mi = mins.as_ref().map_or(0, |a| a.value(i)); + let s = secs.as_ref().map_or(0.0, |a| a.value(i)); + + match make_interval_dt_nano(d, h, mi, s) { + Some(v) => builder.append_value(v), + None => { + builder.append_null(); + continue; + } + } + } + + Ok(Arc::new(builder.finish())) +} +fn make_interval_dt_nano(day: i32, hour: i32, min: i32, sec: f64) -> Option { + const HOURS_PER_DAY: i32 = 24; + const MINS_PER_HOUR: i32 = 60; + const SECS_PER_MINUTE: i64 = 60; + const MICROS_PER_SEC: i64 = 1_000_000; + + let total_hours: i32 = day + .checked_mul(HOURS_PER_DAY) + .and_then(|v| v.checked_add(hour))?; + + let total_mins: i32 = total_hours + .checked_mul(MINS_PER_HOUR) + .and_then(|v| v.checked_add(min))?; + + let mut sec_whole: i64 = sec.trunc() as i64; + let sec_frac: f64 = sec - (sec_whole as f64); + let mut frac_us: i64 = (sec_frac * (MICROS_PER_SEC as f64)).round() as i64; + + if frac_us.abs() >= MICROS_PER_SEC { + if frac_us > 0 { + frac_us -= MICROS_PER_SEC; + sec_whole = sec_whole.checked_add(1)?; + } else { + frac_us += MICROS_PER_SEC; + sec_whole = sec_whole.checked_sub(1)?; + } + } + + let total_secs: i64 = (total_mins as i64) + .checked_mul(SECS_PER_MINUTE) + .and_then(|v| v.checked_add(sec_whole))?; + + let total_us = total_secs + .checked_mul(MICROS_PER_SEC) + .and_then(|v| 
v.checked_add(frac_us))?; + + Some(total_us) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::{DurationMicrosecondArray, Float64Array, Int32Array}; + use arrow::datatypes::DataType::Duration; + use arrow::datatypes::Field; + use arrow::datatypes::TimeUnit::Microsecond; + use datafusion_common::{DataFusionError, Result}; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; + + use super::*; + + fn run_make_dt_interval(arrs: Vec) -> Result { + make_dt_interval_kernel(&arrs) + } + + #[test] + fn nulls_propagate_per_row() -> Result<()> { + let days = Arc::new(Int32Array::from(vec![ + None, + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + ])) as ArrayRef; + + let hours = Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + ])) as ArrayRef; + + let mins = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + Some(6), + Some(7), + ])) as ArrayRef; + + let secs = Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + Some(3.0), + None, + Some(f64::NAN), + Some(f64::INFINITY), + Some(f64::NEG_INFINITY), + ])) as ArrayRef; + + let out = run_make_dt_interval(vec![days, hours, mins, secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + Ok(()) + } + + #[test] + fn error_months_overflow_should_be_null() -> Result<()> { + // months = year*12 + month → NULL + + let days = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef; + + let hours = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef; + + let mins = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef; + + let secs = Arc::new(Float64Array::from(vec![Some(1.0)])) as ArrayRef; + + let out = run_make_dt_interval(vec![days, hours, mins, secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + + Ok(()) + } + + fn invoke_make_dt_interval_with_args( + args: Vec, + number_rows: usize, + ) -> Result { + let arg_fields = args + .iter() + .map(|arg| Field::new("a", arg.data_type(), true).into()) + .collect::>(); + let args = ScalarFunctionArgs { + args, + arg_fields, + number_rows, + return_field: Field::new("f", Duration(Microsecond), true).into(), + config_options: Arc::new(Default::default()), + }; + SparkMakeDtInterval::new().invoke_with_args(args) + } + + #[test] + fn zero_args_returns_zero_duration() -> Result<()> { + let number_rows: usize = 3; + + let res: ColumnarValue = invoke_make_dt_interval_with_args(vec![], number_rows)?; + let arr = res.into_array(number_rows)?; + let arr = arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(arr.len(), number_rows); + for i in 0..number_rows { + assert!(!arr.is_null(i)); + assert_eq!(arr.value(i), 0_i64); + } + Ok(()) + } + + #[test] + fn one_day_minus_24_hours_equals_zero() -> Result<()> { + let arr_days = Arc::new(Int32Array::from(vec![Some(1), Some(-1)])) as ArrayRef; + let arr_hours = Arc::new(Int32Array::from(vec![Some(-24), Some(24)])) as ArrayRef; + let arr_mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let arr_secs = + 
Arc::new(Float64Array::from(vec![Some(0.0), Some(0.0)])) as ArrayRef; + + let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(out.len(), 2); + assert_eq!(out.null_count(), 0); + assert_eq!(out.value(0), 0_i64); + assert_eq!(out.value(1), 0_i64); + Ok(()) + } + + #[test] + fn one_hour_minus_60_mins_equals_zero() -> Result<()> { + let arr_days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let arr_hours = Arc::new(Int32Array::from(vec![Some(-1), Some(1)])) as ArrayRef; + let arr_mins = Arc::new(Int32Array::from(vec![Some(60), Some(-60)])) as ArrayRef; + let arr_secs = + Arc::new(Float64Array::from(vec![Some(0.0), Some(0.0)])) as ArrayRef; + + let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(out.len(), 2); + assert_eq!(out.null_count(), 0); + assert_eq!(out.value(0), 0_i64); + assert_eq!(out.value(1), 0_i64); + Ok(()) + } + + #[test] + fn one_mins_minus_60_secs_equals_zero() -> Result<()> { + let arr_days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let arr_hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let arr_mins = Arc::new(Int32Array::from(vec![Some(-1), Some(1)])) as ArrayRef; + let arr_secs = + Arc::new(Float64Array::from(vec![Some(60.0), Some(-60.0)])) as ArrayRef; + + let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(out.len(), 2); + assert_eq!(out.null_count(), 0); + assert_eq!(out.value(0), 0_i64); + assert_eq!(out.value(1), 0_i64); + Ok(()) + } + + #[test] + fn frac_carries_up_to_next_second_positive() -> Result<()> { + // 0.9999995s → 1_000_000 µs (carry a +1s) + let days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let secs = Arc::new(Float64Array::from(vec![ + Some(0.999_999_5), + Some(0.999_999_4), + ])) as ArrayRef; + + let out = run_make_dt_interval(vec![days, hours, mins, secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(out.len(), 2); + assert_eq!(out.value(0), 1_000_000); + assert_eq!(out.value(1), 999_999); + Ok(()) + } + + #[test] + fn frac_carries_down_to_prev_second_negative() -> Result<()> { + // -0.9999995s → -1_000_000 µs (carry a −1s) + let days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef; + let secs = Arc::new(Float64Array::from(vec![ + Some(-0.999_999_5), + Some(-0.999_999_4), + ])) as ArrayRef; + + let out = run_make_dt_interval(vec![days, hours, mins, secs])?; + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected DurationMicrosecondArray".into()) + })?; + + assert_eq!(out.len(), 2); + assert_eq!(out.value(0), 
-1_000_000); + assert_eq!(out.value(1), -999_999); + Ok(()) + } + + #[test] + fn no_more_than_4_params() -> Result<()> { + let udf = SparkMakeDtInterval::new(); + + let arg_types = vec![ + DataType::Int32, + DataType::Int32, + DataType::Int32, + DataType::Float64, + DataType::Int32, + ]; + + let res = udf.coerce_types(&arg_types); + + assert!( + matches!(res, Err(DataFusionError::Execution(_))), + "make_dt_interval expects between 0 and 4 arguments, got 5" + ); + + Ok(()) + } +} diff --git a/datafusion/spark/src/function/datetime/make_interval.rs b/datafusion/spark/src/function/datetime/make_interval.rs new file mode 100644 index 0000000000..c66f97ff5c --- /dev/null +++ b/datafusion/spark/src/function/datetime/make_interval.rs @@ -0,0 +1,581 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, IntervalMonthDayNanoBuilder, PrimitiveArray}; +use arrow::datatypes::DataType::Interval; +use arrow::datatypes::IntervalUnit::MonthDayNano; +use arrow::datatypes::{DataType, IntervalMonthDayNano}; +use datafusion_common::{ + exec_err, plan_datafusion_err, DataFusionError, Result, ScalarValue, +}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion_functions::utils::make_scalar_function; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkMakeInterval { + signature: Signature, +} + +impl Default for SparkMakeInterval { + fn default() -> Self { + Self::new() + } +} + +impl SparkMakeInterval { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkMakeInterval { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_interval" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Interval(MonthDayNano)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + if args.args.is_empty() { + return Ok(ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano( + Some(IntervalMonthDayNano::new(0, 0, 0)), + ))); + } + make_scalar_function(make_interval_kernel, vec![])(&args.args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + let length = arg_types.len(); + match length { + x if x > 7 => { + exec_err!( + "make_interval expects between 0 and 7 arguments, got {}", + arg_types.len() + ) + } + _ => Ok((0..arg_types.len()) + .map(|i| { + if i == 6 { + DataType::Float64 + } else { + DataType::Int32 + } + }) + .collect()), + } + } +} + +fn make_interval_kernel(args: &[ArrayRef]) -> Result { + use arrow::array::AsArray; + use arrow::datatypes::{Float64Type, Int32Type}; + + let n_rows = 
args[0].len(); + + let years = args[0] + .as_primitive_opt::() + .ok_or_else(|| plan_datafusion_err!("make_interval arg[0] must be Int32"))?; + let months = args + .get(1) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[1] must be Int32") + }) + }) + .transpose()?; + let weeks = args + .get(2) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[2] must be Int32") + }) + }) + .transpose()?; + let days: Option<&PrimitiveArray> = args + .get(3) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[3] must be Int32") + }) + }) + .transpose()?; + let hours: Option<&PrimitiveArray> = args + .get(4) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[4] must be Int32") + }) + }) + .transpose()?; + let mins: Option<&PrimitiveArray> = args + .get(5) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[5] must be Int32") + }) + }) + .transpose()?; + let secs: Option<&PrimitiveArray> = args + .get(6) + .map(|a| { + a.as_primitive_opt::().ok_or_else(|| { + plan_datafusion_err!("make_dt_interval arg[6] must be Float64") + }) + }) + .transpose()?; + + let mut builder = IntervalMonthDayNanoBuilder::with_capacity(n_rows); + + for i in 0..n_rows { + // if one column is NULL → result NULL + let any_null_present = years.is_null(i) + || months.as_ref().is_some_and(|a| a.is_null(i)) + || weeks.as_ref().is_some_and(|a| a.is_null(i)) + || days.as_ref().is_some_and(|a| a.is_null(i)) + || hours.as_ref().is_some_and(|a| a.is_null(i)) + || mins.as_ref().is_some_and(|a| a.is_null(i)) + || secs + .as_ref() + .is_some_and(|a| a.is_null(i) || !a.value(i).is_finite()); + + if any_null_present { + builder.append_null(); + continue; + } + + // default values 0 or 0.0 + let y = years.value(i); + let mo = months.as_ref().map_or(0, |a| a.value(i)); + let w = weeks.as_ref().map_or(0, |a| a.value(i)); + let d = days.as_ref().map_or(0, |a| a.value(i)); + let h = hours.as_ref().map_or(0, |a| a.value(i)); + let mi = mins.as_ref().map_or(0, |a| a.value(i)); + let s = secs.as_ref().map_or(0.0, |a| a.value(i)); + + match make_interval_month_day_nano(y, mo, w, d, h, mi, s) { + Some(v) => builder.append_value(v), + None => { + builder.append_null(); + continue; + } + } + } + + Ok(Arc::new(builder.finish())) +} + +fn make_interval_month_day_nano( + year: i32, + month: i32, + week: i32, + day: i32, + hour: i32, + min: i32, + sec: f64, +) -> Option { + // checks if overflow + let months = year.checked_mul(12).and_then(|v| v.checked_add(month))?; + let total_days = week.checked_mul(7).and_then(|v| v.checked_add(day))?; + + let hours_nanos = (hour as i64).checked_mul(3_600_000_000_000)?; + let mins_nanos = (min as i64).checked_mul(60_000_000_000)?; + + let sec_int = sec.trunc() as i64; + let frac = sec - sec.trunc(); + let mut frac_nanos = (frac * 1_000_000_000.0).round() as i64; + + if frac_nanos.abs() >= 1_000_000_000 { + if frac_nanos > 0 { + frac_nanos -= 1_000_000_000; + } else { + frac_nanos += 1_000_000_000; + } + } + + let secs_nanos = sec_int.checked_mul(1_000_000_000)?; + + let total_nanos = hours_nanos + .checked_add(mins_nanos) + .and_then(|v| v.checked_add(secs_nanos)) + .and_then(|v| v.checked_add(frac_nanos))?; + + Some(IntervalMonthDayNano::new(months, total_days, total_nanos)) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Float64Array, Int32Array, IntervalMonthDayNanoArray}; + use 
arrow::datatypes::Field; + use datafusion_common::config::ConfigOptions; + use datafusion_common::Result; + + use super::*; + fn run_make_interval_month_day_nano(arrs: Vec) -> Result { + make_interval_kernel(&arrs) + } + + #[test] + fn nulls_propagate_per_row() { + let year = Arc::new(Int32Array::from(vec![ + None, + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ])); + let month = Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ])); + let week = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ])); + let day = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ])); + let hour = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + None, + Some(6), + Some(7), + Some(8), + Some(9), + ])); + let min = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + None, + Some(7), + Some(8), + Some(9), + ])); + let sec = Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + Some(3.0), + Some(4.0), + Some(5.0), + Some(6.0), + None, + Some(f64::INFINITY), + Some(f64::NEG_INFINITY), + ])); + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, min, sec, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected IntervalMonthDayNano".into()) + }) + .unwrap(); + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + } + + #[test] + fn error_months_overflow_should_be_null() { + // months = year*12 + month → NULL + let year = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, min, sec, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected IntervalMonthDayNano".into()) + }) + .unwrap(); + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + } + #[test] + fn error_days_overflow_should_be_null() { + // months = year*12 + month → NULL + let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, min, sec, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected IntervalMonthDayNano".into()) + }) + .unwrap(); + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + 
} + } + #[test] + fn error_min_overflow_should_be_null() { + let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let min = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef; + let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, min, sec, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected IntervalMonthDayNano".into()) + }) + .unwrap(); + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + } + #[test] + fn error_sec_overflow_should_be_null() { + let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef; + let sec = Arc::new(Float64Array::from(vec![Some(f64::MAX)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, min, sec, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected IntervalMonthDayNano".into()) + }) + .unwrap(); + + for i in 0..out.len() { + assert!(out.is_null(i), "row {i} should be NULL"); + } + } + + #[test] + fn happy_path_all_present_single_row() { + // 1y 2m 3w 4d 5h 6m 7.25s + let year = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(2)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(3)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(4)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(5)])) as ArrayRef; + let mins = Arc::new(Int32Array::from(vec![Some(6)])) as ArrayRef; + let secs = Arc::new(Float64Array::from(vec![Some(7.25)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, month, week, day, hour, mins, secs, + ]) + .unwrap(); + assert_eq!(out.data_type(), &Interval(MonthDayNano)); + + let out = out + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(out.len(), 1); + assert_eq!(out.null_count(), 0); + + let v: IntervalMonthDayNano = out.value(0); + assert_eq!(v.months, 12 + 2); // 14 + assert_eq!(v.days, 3 * 7 + 4); // 25 + let expected_nanos = (5_i64 * 3600 + 6 * 60 + 7) * 1_000_000_000 + 250_000_000; + assert_eq!(v.nanoseconds, expected_nanos); + } + + #[test] + fn negative_components_and_fractional_seconds() { + // -1y -2m -1w -1d -1h -1m -1.5s + let year = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef; + let month = Arc::new(Int32Array::from(vec![Some(-2)])) as ArrayRef; + let week = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef; + let day = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef; + let hour = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef; + let mins = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef; + let secs = Arc::new(Float64Array::from(vec![Some(-1.5)])) as ArrayRef; + + let out = run_make_interval_month_day_nano(vec![ + year, 
month, week, day, hour, mins, secs, + ]) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(out.len(), 1); + assert_eq!(out.null_count(), 0); + let v = out.value(0); + + assert_eq!(v.months, -12 + (-2)); // -14 + assert_eq!(v.days, -7 + (-1)); // -8 + + // -(1h + 1m + 1.5s) en nanos + let expected_nanos = -((3600_i64 + 60 + 1) * 1_000_000_000 + 500_000_000); + assert_eq!(v.nanoseconds, expected_nanos); + } + + fn invoke_make_interval_with_args( + args: Vec, + number_rows: usize, + ) -> Result { + let arg_fields = args + .iter() + .map(|arg| Field::new("a", arg.data_type(), true).into()) + .collect::>(); + let args = ScalarFunctionArgs { + args, + arg_fields, + number_rows, + return_field: Field::new("f", Interval(MonthDayNano), true).into(), + config_options: Arc::new(ConfigOptions::default()), + }; + SparkMakeInterval::new().invoke_with_args(args) + } + + #[test] + fn zero_args_returns_zero_seconds() -> Result<()> { + let number_rows = 2; + let res: ColumnarValue = invoke_make_interval_with_args(vec![], number_rows)?; + + match res { + ColumnarValue::Array(arr) => { + let arr = arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal( + "expected IntervalMonthDayNanoArray".into(), + ) + })?; + if arr.len() != number_rows { + return Err(DataFusionError::Internal(format!( + "expected array length {number_rows}, got {}", + arr.len() + ))); + } + for i in 0..number_rows { + let iv = arr.value(i); + if (iv.months, iv.days, iv.nanoseconds) != (0, 0, 0) { + return Err(DataFusionError::Internal(format!( + "row {i}: expected (0,0,0), got ({},{},{})", + iv.months, iv.days, iv.nanoseconds + ))); + } + } + } + ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(iv))) => { + if (iv.months, iv.days, iv.nanoseconds) != (0, 0, 0) { + return Err(DataFusionError::Internal(format!( + "expected scalar 0s, got ({},{},{})", + iv.months, iv.days, iv.nanoseconds + ))); + } + } + other => { + return Err(DataFusionError::Internal(format!( + "expected Array or Scalar IntervalMonthDayNano, got {other:?}" + ))); + } + } + + Ok(()) + } +} diff --git a/datafusion/spark/src/function/datetime/mod.rs b/datafusion/spark/src/function/datetime/mod.rs index 0e37284cc6..a6adc99607 100644 --- a/datafusion/spark/src/function/datetime/mod.rs +++ b/datafusion/spark/src/function/datetime/mod.rs @@ -18,6 +18,8 @@ pub mod date_add; pub mod date_sub; pub mod last_day; +pub mod make_dt_interval; +pub mod make_interval; pub mod next_day; use datafusion_expr::ScalarUDF; @@ -27,6 +29,8 @@ use std::sync::Arc; make_udf_function!(date_add::SparkDateAdd, date_add); make_udf_function!(date_sub::SparkDateSub, date_sub); make_udf_function!(last_day::SparkLastDay, last_day); +make_udf_function!(make_dt_interval::SparkMakeDtInterval, make_dt_interval); +make_udf_function!(make_interval::SparkMakeInterval, make_interval); make_udf_function!(next_day::SparkNextDay, next_day); pub mod expr_fn { @@ -47,6 +51,16 @@ pub mod expr_fn { "Returns the last day of the month which the date belongs to.", arg1 )); + export_functions!(( + make_dt_interval, + "Make a day time interval from given days, hours, mins and secs (return type is actually a Duration(Microsecond))", + days hours mins secs + )); + export_functions!(( + make_interval, + "Make interval from years, months, weeks, days, hours, mins and secs.", + years months weeks days hours mins secs + )); // TODO: add once ANSI support is added: // "When both of the input parameters are not NULL and day_of_week is an invalid input, the 
function throws SparkIllegalArgumentException if spark.sql.ansi.enabled is set to true, otherwise NULL." export_functions!(( @@ -57,5 +71,12 @@ pub mod expr_fn { } pub fn functions() -> Vec> { - vec![date_add(), date_sub(), last_day(), next_day()] + vec![ + date_add(), + date_sub(), + last_day(), + make_dt_interval(), + make_interval(), + next_day(), + ] } diff --git a/datafusion/spark/src/function/map/map_from_arrays.rs b/datafusion/spark/src/function/map/map_from_arrays.rs new file mode 100644 index 0000000000..987548e353 --- /dev/null +++ b/datafusion/spark/src/function/map/map_from_arrays.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use crate::function::map::utils::{ + get_element_type, get_list_offsets, get_list_values, + map_from_keys_values_offsets_nulls, map_type_from_key_value_types, +}; +use arrow::array::{Array, ArrayRef, NullArray}; +use arrow::compute::kernels::cast; +use arrow::datatypes::DataType; +use datafusion_common::utils::take_function_args; +use datafusion_common::Result; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_functions::utils::make_scalar_function; + +/// Spark-compatible `map_from_arrays` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct MapFromArrays { + signature: Signature, +} + +impl Default for MapFromArrays { + fn default() -> Self { + Self::new() + } +} + +impl MapFromArrays { + pub fn new() -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MapFromArrays { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "map_from_arrays" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let [key_type, value_type] = take_function_args("map_from_arrays", arg_types)?; + Ok(map_type_from_key_value_types( + get_element_type(key_type)?, + get_element_type(value_type)?, + )) + } + + fn invoke_with_args( + &self, + args: datafusion_expr::ScalarFunctionArgs, + ) -> Result { + make_scalar_function(map_from_arrays_inner, vec![])(&args.args) + } +} + +fn map_from_arrays_inner(args: &[ArrayRef]) -> Result { + let [keys, values] = take_function_args("map_from_arrays", args)?; + + if matches!(keys.data_type(), DataType::Null) + || matches!(values.data_type(), DataType::Null) + { + return Ok(cast( + &NullArray::new(keys.len()), + &map_type_from_key_value_types( + get_element_type(keys.data_type())?, + get_element_type(values.data_type())?, + ), + )?); + } + + map_from_keys_values_offsets_nulls( + get_list_values(keys)?, + get_list_values(values)?, + &get_list_offsets(keys)?, + &get_list_offsets(values)?, + keys.nulls(), + values.nulls(), + ) +} diff --git 
a/datafusion/spark/src/function/map/map_from_entries.rs b/datafusion/spark/src/function/map/map_from_entries.rs new file mode 100644 index 0000000000..6648979c5d --- /dev/null +++ b/datafusion/spark/src/function/map/map_from_entries.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use crate::function::map::utils::{ + get_element_type, get_list_offsets, get_list_values, + map_from_keys_values_offsets_nulls, map_type_from_key_value_types, +}; +use arrow::array::{Array, ArrayRef, NullBufferBuilder, StructArray}; +use arrow::buffer::NullBuffer; +use arrow::datatypes::DataType; +use datafusion_common::utils::take_function_args; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_functions::utils::make_scalar_function; + +/// Spark-compatible `map_from_entries` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct MapFromEntries { + signature: Signature, +} + +impl Default for MapFromEntries { + fn default() -> Self { + Self::new() + } +} + +impl MapFromEntries { + pub fn new() -> Self { + Self { + signature: Signature::array(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MapFromEntries { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "map_from_entries" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let [entries_type] = take_function_args("map_from_entries", arg_types)?; + let entries_element_type = get_element_type(entries_type)?; + let (keys_type, values_type) = match entries_element_type { + DataType::Struct(fields) if fields.len() == 2 => { + Ok((fields[0].data_type(), fields[1].data_type())) + } + wrong_type => exec_err!( + "map_from_entries: expected array>, got {:?}", + wrong_type + ), + }?; + Ok(map_type_from_key_value_types(keys_type, values_type)) + } + + fn invoke_with_args( + &self, + args: datafusion_expr::ScalarFunctionArgs, + ) -> Result { + make_scalar_function(map_from_entries_inner, vec![])(&args.args) + } +} + +fn map_from_entries_inner(args: &[ArrayRef]) -> Result { + let [entries] = take_function_args("map_from_entries", args)?; + let entries_offsets = get_list_offsets(entries)?; + let entries_values = get_list_values(entries)?; + + let (flat_keys, flat_values) = + match entries_values.as_any().downcast_ref::() { + Some(a) => Ok((a.column(0), a.column(1))), + None => exec_err!( + "map_from_entries: expected array>, got {:?}", + entries_values.data_type() + ), + }?; + + let entries_with_nulls = entries_values.nulls().and_then(|entries_inner_nulls| { + let mut builder = NullBufferBuilder::new_with_len(0); + let mut cur_offset = entries_offsets + .first() + 
.map(|offset| *offset as usize) + .unwrap_or(0); + + for next_offset in entries_offsets.iter().skip(1) { + let num_entries = *next_offset as usize - cur_offset; + builder.append( + entries_inner_nulls + .slice(cur_offset, num_entries) + .null_count() + == 0, + ); + cur_offset = *next_offset as usize; + } + builder.finish() + }); + + let res_nulls = NullBuffer::union(entries.nulls(), entries_with_nulls.as_ref()); + + map_from_keys_values_offsets_nulls( + flat_keys, + flat_values, + &entries_offsets, + &entries_offsets, + None, + res_nulls.as_ref(), + ) +} diff --git a/datafusion/spark/src/function/map/mod.rs b/datafusion/spark/src/function/map/mod.rs index a87df9a2c8..2f596b19b4 100644 --- a/datafusion/spark/src/function/map/mod.rs +++ b/datafusion/spark/src/function/map/mod.rs @@ -15,11 +15,33 @@ // specific language governing permissions and limitations // under the License. +pub mod map_from_arrays; +pub mod map_from_entries; +mod utils; + use datafusion_expr::ScalarUDF; +use datafusion_functions::make_udf_function; use std::sync::Arc; -pub mod expr_fn {} +make_udf_function!(map_from_arrays::MapFromArrays, map_from_arrays); +make_udf_function!(map_from_entries::MapFromEntries, map_from_entries); + +pub mod expr_fn { + use datafusion_functions::export_functions; + + export_functions!(( + map_from_arrays, + "Creates a map from arrays of keys and values.", + keys values + )); + + export_functions!(( + map_from_entries, + "Creates a map from array>.", + arg1 + )); +} pub fn functions() -> Vec> { - vec![] + vec![map_from_arrays(), map_from_entries()] } diff --git a/datafusion/spark/src/function/map/utils.rs b/datafusion/spark/src/function/map/utils.rs new file mode 100644 index 0000000000..b568f45403 --- /dev/null +++ b/datafusion/spark/src/function/map/utils.rs @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::borrow::Cow; +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, AsArray, BooleanBuilder, MapArray, StructArray}; +use arrow::buffer::{NullBuffer, OffsetBuffer}; +use arrow::compute::filter; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{exec_err, Result, ScalarValue}; + +/// Helper function to get element [`DataType`] +/// from [`List`](DataType::List)/[`LargeList`](DataType::LargeList)/[`FixedSizeList`](DataType::FixedSizeList)
+/// [`Null`](DataType::Null) can be coerced to `ListType`([`Null`](DataType::Null)), so [`Null`](DataType::Null) is returned
+/// For all other types [`exec_err`] is raised +pub fn get_element_type(data_type: &DataType) -> Result<&DataType> { + match data_type { + DataType::Null => Ok(data_type), + DataType::List(element) + | DataType::LargeList(element) + | DataType::FixedSizeList(element, _) => Ok(element.data_type()), + _ => exec_err!( + "get_element_type expects List/LargeList/FixedSizeList/Null as argument, got {data_type:?}" + ), + } +} + +/// Helper function to get [`values`](arrow::array::ListArray::values) +/// from [`ListArray`](arrow::array::ListArray)/[`LargeListArray`](arrow::array::LargeListArray)/[`FixedSizeListArray`](arrow::array::FixedSizeListArray)
+/// [`NullArray`](arrow::array::NullArray) can be coerced to `ListType`([`Null`](DataType::Null)), so [`NullArray`](arrow::array::NullArray) is returned
+/// For all other types [`exec_err`] is raised +pub fn get_list_values(array: &ArrayRef) -> Result<&ArrayRef> { + match array.data_type() { + DataType::Null => Ok(array), + DataType::List(_) => Ok(array.as_list::<i32>().values()), + DataType::LargeList(_) => Ok(array.as_list::<i64>().values()), + DataType::FixedSizeList(..) => Ok(array.as_fixed_size_list().values()), + wrong_type => exec_err!( + "get_list_values expects List/LargeList/FixedSizeList/Null as argument, got {wrong_type:?}" + ), + } +}
+/// For all other types [`exec_err`] is raised +pub fn get_list_offsets(array: &ArrayRef) -> Result<Cow<[i32]>> { + match array.data_type() { + DataType::List(_) => Ok(Cow::Borrowed(array.as_list::<i32>().offsets().as_ref())), + DataType::LargeList(_) => Ok(Cow::Owned( + array.as_list::<i64>() + .offsets() + .iter() + .map(|i| *i as i32) + .collect::<Vec<_>>(), + )), + DataType::FixedSizeList(_, size) => Ok(Cow::Owned( + (0..=array.len() as i32).map(|i| size * i).collect() + )), + wrong_type => exec_err!( + "get_list_offsets expects List/LargeList/FixedSizeList as argument, got {wrong_type:?}" + ), + } +} + +/// Helper function to construct [`MapType`](DataType::Map) given K and V DataTypes for keys and values +/// - Map keys are unsorted +/// - Map keys are non-nullable +/// - Map entries are non-nullable +/// - Map values can be null +pub fn map_type_from_key_value_types( + key_type: &DataType, + value_type: &DataType, +) -> DataType { + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + // the key must not be nullable + Field::new("key", key_type.clone(), false), + Field::new("value", value_type.clone(), true), + ])), + false, // the entry is not nullable + )), + false, // the keys are not sorted + ) +} + +/// Helper function to construct MapArray from flattened ListArrays and OffsetBuffer +/// +/// Logic is close to `datafusion_functions_nested::map::make_map_array_internal`
+/// But there are some core differences: +/// 1. Input arrays are not [`ListArrays`](arrow::array::ListArray) themselves, but their flattened [`values`](arrow::array::ListArray::values)
+/// So the inputs can be [`ListArray`](`arrow::array::ListArray`)/[`LargeListArray`](`arrow::array::LargeListArray`)/[`FixedSizeListArray`](`arrow::array::FixedSizeListArray`)
+/// To preserve the row info, [`offsets`](arrow::array::ListArray::offsets) and [`nulls`](arrow::array::ListArray::nulls) for both keys and values need to be provided
+/// [`FixedSizeListArray`](`arrow::array::FixedSizeListArray`) has no `offsets`, so they can be generated as a cumulative sum of its `Size` +/// 2. Spark provides [spark.sql.mapKeyDedupPolicy](https://github.com/apache/spark/blob/cf3a34e19dfcf70e2d679217ff1ba21302212472/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4961) +/// to handle duplicate keys
+/// For now, configurable functions are not supported by DataFusion
+/// So the more permissive `LAST_WIN` option is used in this implementation (instead of `EXCEPTION`)
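+/// For example, under `LAST_WIN`, keys `[1, 1]` with values `['a', 'b']` produce the single entry `{1: 'b'}` (the last value for a duplicate key wins)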
+/// `EXCEPTION` behaviour can still be achieved externally at the cost of performance:
+/// `when(array_length(array_distinct(keys)) == array_length(keys), constructed_map)`
+/// `.otherwise(raise_error("duplicate keys occurred during map construction"))` +pub fn map_from_keys_values_offsets_nulls( + flat_keys: &ArrayRef, + flat_values: &ArrayRef, + keys_offsets: &[i32], + values_offsets: &[i32], + keys_nulls: Option<&NullBuffer>, + values_nulls: Option<&NullBuffer>, +) -> Result { + let (keys, values, offsets) = map_deduplicate_keys( + flat_keys, + flat_values, + keys_offsets, + values_offsets, + keys_nulls, + values_nulls, + )?; + let nulls = NullBuffer::union(keys_nulls, values_nulls); + + let fields = Fields::from(vec![ + Field::new("key", flat_keys.data_type().clone(), false), + Field::new("value", flat_values.data_type().clone(), true), + ]); + let entries = StructArray::try_new(fields.clone(), vec![keys, values], None)?; + let field = Arc::new(Field::new("entries", DataType::Struct(fields), false)); + Ok(Arc::new(MapArray::try_new( + field, offsets, entries, nulls, false, + )?)) +} + +fn map_deduplicate_keys( + flat_keys: &ArrayRef, + flat_values: &ArrayRef, + keys_offsets: &[i32], + values_offsets: &[i32], + keys_nulls: Option<&NullBuffer>, + values_nulls: Option<&NullBuffer>, +) -> Result<(ArrayRef, ArrayRef, OffsetBuffer)> { + let offsets_len = keys_offsets.len(); + let mut new_offsets = Vec::with_capacity(offsets_len); + + let mut cur_keys_offset = keys_offsets + .first() + .map(|offset| *offset as usize) + .unwrap_or(0); + let mut cur_values_offset = values_offsets + .first() + .map(|offset| *offset as usize) + .unwrap_or(0); + + let mut new_last_offset = 0; + new_offsets.push(new_last_offset); + + let mut keys_mask_builder = BooleanBuilder::new(); + let mut values_mask_builder = BooleanBuilder::new(); + for (row_idx, (next_keys_offset, next_values_offset)) in keys_offsets + .iter() + .zip(values_offsets.iter()) + .skip(1) + .enumerate() + { + let num_keys_entries = *next_keys_offset as usize - cur_keys_offset; + let num_values_entries = *next_values_offset as usize - cur_values_offset; + + let mut keys_mask_one = [false].repeat(num_keys_entries); + let mut values_mask_one = [false].repeat(num_values_entries); + + let key_is_valid = keys_nulls.is_none_or(|buf| buf.is_valid(row_idx)); + let value_is_valid = values_nulls.is_none_or(|buf| buf.is_valid(row_idx)); + + if key_is_valid && value_is_valid { + if num_keys_entries != num_values_entries { + return exec_err!("map_deduplicate_keys: keys and values lists in the same row must have equal lengths"); + } else if num_keys_entries != 0 { + let mut seen_keys = HashSet::new(); + + for cur_entry_idx in (0..num_keys_entries).rev() { + let key = ScalarValue::try_from_array( + &flat_keys, + cur_keys_offset + cur_entry_idx, + )? 
+ .compacted(); + if seen_keys.contains(&key) { + // TODO: implement configuration and logic for spark.sql.mapKeyDedupPolicy=EXCEPTION (this is default spark-config) + // exec_err!("invalid argument: duplicate keys in map") + // https://github.com/apache/spark/blob/cf3a34e19dfcf70e2d679217ff1ba21302212472/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4961 + } else { + // This code implements deduplication logic for spark.sql.mapKeyDedupPolicy=LAST_WIN (this is NOT default spark-config) + keys_mask_one[cur_entry_idx] = true; + values_mask_one[cur_entry_idx] = true; + seen_keys.insert(key); + new_last_offset += 1; + } + } + } + } else { + // the result entry is NULL + // both current row offsets are skipped + // keys or values in the current row are marked false in the masks + } + keys_mask_builder.append_array(&keys_mask_one.into()); + values_mask_builder.append_array(&values_mask_one.into()); + new_offsets.push(new_last_offset); + cur_keys_offset += num_keys_entries; + cur_values_offset += num_values_entries; + } + let keys_mask = keys_mask_builder.finish(); + let values_mask = values_mask_builder.finish(); + let needed_keys = filter(&flat_keys, &keys_mask)?; + let needed_values = filter(&flat_values, &values_mask)?; + let offsets = OffsetBuffer::new(new_offsets.into()); + Ok((needed_keys, needed_values, offsets)) +} diff --git a/datafusion/spark/src/function/math/factorial.rs b/datafusion/spark/src/function/math/factorial.rs index 3addc9700e..ebb489d4ce 100644 --- a/datafusion/spark/src/function/math/factorial.rs +++ b/datafusion/spark/src/function/math/factorial.rs @@ -111,7 +111,7 @@ pub fn spark_factorial(args: &[ColumnarValue]) -> Result { - exec_err!("`factorial` got an unexpected scalar type: {:?}", other) + exec_err!("`factorial` got an unexpected scalar type: {}", other) } ColumnarValue::Array(array) => match array.data_type() { Int32 => { @@ -122,7 +122,7 @@ pub fn spark_factorial(args: &[ColumnarValue]) -> Result { - exec_err!("`factorial` got an unexpected argument type: {:?}", other) + exec_err!("`factorial` got an unexpected argument type: {}", other) } }, } diff --git a/datafusion/spark/src/function/math/hex.rs b/datafusion/spark/src/function/math/hex.rs index 0cdf4f3a9a..120a053bb4 100644 --- a/datafusion/spark/src/function/math/hex.rs +++ b/datafusion/spark/src/function/math/hex.rs @@ -272,7 +272,7 @@ pub fn compute_hex( .map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose()) .collect::>()?, _ => exec_err!( - "hex got an unexpected argument type: {:?}", + "hex got an unexpected argument type: {}", array.data_type() )?, }; @@ -287,10 +287,7 @@ pub fn compute_hex( Ok(ColumnarValue::Array(Arc::new(string_array_values))) } - _ => exec_err!( - "hex got an unexpected argument type: {:?}", - array.data_type() - ), + _ => exec_err!("hex got an unexpected argument type: {}", array.data_type()), }, _ => exec_err!("native hex does not support scalar values at this time"), } diff --git a/datafusion/spark/src/function/string/luhn_check.rs b/datafusion/spark/src/function/string/luhn_check.rs index 79b2a854f7..090b16e34b 100644 --- a/datafusion/spark/src/function/string/luhn_check.rs +++ b/datafusion/spark/src/function/string/luhn_check.rs @@ -149,5 +149,5 @@ fn luhn_check_impl(input: &str) -> bool { alt = !alt; } - digits_processed > 0 && sum % 10 == 0 + digits_processed > 0 && sum.is_multiple_of(10) } diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index 531883a6c4..bec7d90062 100644 --- a/datafusion/spark/src/lib.rs +++ 
b/datafusion/spark/src/lib.rs @@ -53,6 +53,8 @@ //! # impl FunctionRegistry for SessionContext { //! # fn register_udf(&mut self, _udf: Arc) -> Result>> { Ok (None) } //! # fn udfs(&self) -> HashSet { unimplemented!() } +//! # fn udafs(&self) -> HashSet { unimplemented!() } +//! # fn udwfs(&self) -> HashSet { unimplemented!() } //! # fn udf(&self, _name: &str) -> Result> { unimplemented!() } //! # fn udaf(&self, name: &str) -> Result> {unimplemented!() } //! # fn udwf(&self, name: &str) -> Result> { unimplemented!() } diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 7bd775fb53..ea2cd6dfcc 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -50,6 +50,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } bigdecimal = { workspace = true } +chrono = { workspace = true } datafusion-common = { workspace = true, features = ["sql"] } datafusion-expr = { workspace = true, features = ["sql"] } indexmap = { workspace = true } @@ -63,9 +64,10 @@ ctor = { workspace = true } # please do not move these dependencies to the main dependencies section datafusion-functions = { workspace = true, default-features = true } datafusion-functions-aggregate = { workspace = true } -datafusion-functions-nested = { workspace = true } +datafusion-functions-nested = { workspace = true, features = ["sql"] } datafusion-functions-window = { workspace = true } env_logger = { workspace = true } insta = { workspace = true } +itertools = { workspace = true } paste = "^1.0" rstest = { workspace = true } diff --git a/datafusion/sql/README.md b/datafusion/sql/README.md index d5ef3114c1..d0e5e498e5 100644 --- a/datafusion/sql/README.md +++ b/datafusion/sql/README.md @@ -17,10 +17,10 @@ under the License. --> -# DataFusion SQL Query Planner +# Apache DataFusion SQL Query Planner This crate provides a general purpose SQL query planner that can parse SQL and translate queries into logical -plans. Although this crate is used by the [DataFusion][df] query engine, it was designed to be easily usable from any +plans. Although this crate is used by the [Apache DataFusion] query engine, it was designed to be easily usable from any project that requires a SQL query planner and does not make any assumptions about how the resulting logical plan will be translated to a physical plan. For example, there is no concept of row-based versus columnar execution in the logical plan. @@ -29,12 +29,12 @@ Note that the [`datafusion`] crate re-exports this module. If you are already using the [`datafusion`] crate in your project, there is no reason to use this crate directly in your project as well. -[df]: https://crates.io/crates/datafusion +[apache datafusion]: https://datafusion.apache.org/ [`datafusion`]: https://crates.io/crates/datafusion ## Example Usage -See the [examples](examples) directory for fully working examples. +See the [examples] directory for fully working examples. Here is an example of producing a logical plan from a SQL string. @@ -69,8 +69,8 @@ fn main() { ``` This is the logical plan that is produced from this example. Note that this is an **unoptimized** -logical plan. The [datafusion-optimizer](https://crates.io/crates/datafusion-optimizer) crate provides a query -optimizer that can be applied to plans produced by this crate. +logical plan. The [datafusion-optimizer] crate provides a query optimizer that can be applied to +plans produced by this crate. 
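Below is a minimal sketch of handing the plan off to that optimizer (assuming the `datafusion-optimizer` crate and the `plan` variable produced by the example above; exact signatures can vary between releases):

```rust
use datafusion_optimizer::{Optimizer, OptimizerContext};

// `plan` is the unoptimized LogicalPlan produced in the example above.
let optimizer = Optimizer::new();
let optimized = optimizer
    .optimize(plan, &OptimizerContext::new(), |_plan, _rule| {})
    .expect("optimization failed");
println!("{}", optimized.display_indent());
```

The unoptimized plan produced by the example is: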
``` Sort: state_tax DESC NULLS FIRST @@ -87,4 +87,5 @@ Sort: state_tax DESC NULLS FIRST TableScan: orders ``` -[df]: https://crates.io/crates/datafusion +[examples]: examples +[datafusion-optimizer]: https://crates.io/crates/datafusion-optimizer diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs index 3650aea9c3..aceec67676 100644 --- a/datafusion/sql/src/cte.rs +++ b/datafusion/sql/src/cte.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use arrow::datatypes::Schema; use datafusion_common::{ not_impl_err, plan_err, tree_node::{TreeNode, TreeNodeRecursion}, @@ -135,10 +134,9 @@ impl SqlToRel<'_, S> { // ---------- Step 2: Create a temporary relation ------------------ // Step 2.1: Create a table source for the temporary relation - let work_table_source = self.context_provider.create_cte_work_table( - &cte_name, - Arc::new(Schema::from(static_plan.schema().as_ref())), - )?; + let work_table_source = self + .context_provider + .create_cte_work_table(&cte_name, Arc::clone(static_plan.schema().inner()))?; // Step 2.2: Create a temporary relation logical plan that will be used // as the input to the recursive term diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 7727428502..a61967ed69 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -270,7 +270,24 @@ impl SqlToRel<'_, S> { // User-defined function (UDF) should have precedence if let Some(fm) = self.context_provider.get_function_meta(&name) { let args = self.function_args_to_expr(args, schema, planner_context)?; - return Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fm, args))); + let inner = ScalarFunction::new_udf(fm, args); + + if name.eq_ignore_ascii_case(inner.name()) { + return Ok(Expr::ScalarFunction(inner)); + } else { + // If the function is called by an alias, a verbose string representation is created + // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias` + // to ensure the output column name matches the user's query. + let arg_names = inner + .args + .iter() + .map(|arg| arg.to_string()) + .collect::>() + .join(","); + let verbose_alias = format!("{name}({arg_names})"); + + return Ok(Expr::ScalarFunction(inner).alias(verbose_alias)); + } } // Build Unnest expression @@ -381,7 +398,7 @@ impl SqlToRel<'_, S> { distinct, } = window_expr; - let expr = Expr::from(WindowFunction { + let inner = WindowFunction { fun: func_def, params: expr::WindowFunctionParams { args, @@ -392,9 +409,25 @@ impl SqlToRel<'_, S> { null_treatment, distinct, }, - }); + }; - return Ok(expr); + if name.eq_ignore_ascii_case(inner.fun.name()) { + return Ok(Expr::WindowFunction(Box::new(inner))); + } else { + // If the function is called by an alias, a verbose string representation is created + // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias` + // to ensure the output column name matches the user's query. 
+ let arg_names = inner + .params + .args + .iter() + .map(|arg| arg.to_string()) + .collect::>() + .join(","); + let verbose_alias = format!("{name}({arg_names})"); + + return Ok(Expr::WindowFunction(Box::new(inner)).alias(verbose_alias)); + } } } else { // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function @@ -472,14 +505,32 @@ impl SqlToRel<'_, S> { null_treatment, } = aggregate_expr; - return Ok(Expr::AggregateFunction(expr::AggregateFunction::new_udf( + let inner = expr::AggregateFunction::new_udf( func, args, distinct, filter, order_by, null_treatment, - ))); + ); + + if name.eq_ignore_ascii_case(inner.func.name()) { + return Ok(Expr::AggregateFunction(inner)); + } else { + // If the function is called by an alias, a verbose string representation is created + // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias` + // to ensure the output column name matches the user's query. + let arg_names = inner + .params + .args + .iter() + .map(|arg| arg.to_string()) + .collect::>() + .join(","); + let verbose_alias = format!("{name}({arg_names})"); + + return Ok(Expr::AggregateFunction(inner).alias(verbose_alias)); + } } } diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 2c673162ec..06418fa4ec 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -188,7 +188,9 @@ pub(crate) type LexOrdering = Vec; /// Syntax: /// /// ```text -/// CREATE EXTERNAL TABLE +/// CREATE +/// [ OR REPLACE ] +/// EXTERNAL TABLE /// [ IF NOT EXISTS ] /// [ () ] /// STORED AS @@ -221,6 +223,8 @@ pub struct CreateExternalTable { pub order_exprs: Vec, /// Option to not error if table already exists pub if_not_exists: bool, + /// Option to replace table content if table already exists + pub or_replace: bool, /// Whether the table is a temporary table pub temporary: bool, /// Infinite streams? 
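The new `or_replace` flag corresponds to statements of the form `CREATE OR REPLACE EXTERNAL TABLE ...`. A minimal sketch of parsing such a statement with `DFParser` (illustrative only; the parser tests further below exercise this path in detail):

```rust
use datafusion_sql::parser::DFParser;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let sql =
        "CREATE OR REPLACE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet'";
    // Produces a DataFusion-specific CreateExternalTable statement with or_replace = true
    let statements = DFParser::parse_sql(sql)?;
    assert_eq!(statements.len(), 1);
    Ok(())
}
```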
@@ -724,11 +728,26 @@ impl<'a> DFParser<'a> { /// Parse a SQL `CREATE` statement handling `CREATE EXTERNAL TABLE` pub fn parse_create(&mut self) -> Result { - if self.parser.parse_keyword(Keyword::EXTERNAL) { - self.parse_create_external_table(false) - } else if self.parser.parse_keyword(Keyword::UNBOUNDED) { - self.parser.expect_keyword(Keyword::EXTERNAL)?; - self.parse_create_external_table(true) + // TODO: Change sql parser to take in `or_replace: bool` inside parse_create() + if self + .parser + .parse_keywords(&[Keyword::OR, Keyword::REPLACE, Keyword::EXTERNAL]) + { + self.parse_create_external_table(false, true) + } else if self.parser.parse_keywords(&[ + Keyword::OR, + Keyword::REPLACE, + Keyword::UNBOUNDED, + Keyword::EXTERNAL, + ]) { + self.parse_create_external_table(true, true) + } else if self.parser.parse_keyword(Keyword::EXTERNAL) { + self.parse_create_external_table(false, false) + } else if self + .parser + .parse_keywords(&[Keyword::UNBOUNDED, Keyword::EXTERNAL]) + { + self.parse_create_external_table(true, false) } else { Ok(Statement::Statement(Box::from(self.parser.parse_create()?))) } @@ -876,15 +895,22 @@ impl<'a> DFParser<'a> { fn parse_create_external_table( &mut self, unbounded: bool, + or_replace: bool, ) -> Result { let temporary = self .parser .parse_one_of_keywords(&[Keyword::TEMP, Keyword::TEMPORARY]) .is_some(); + self.parser.expect_keyword(Keyword::TABLE)?; let if_not_exists = self.parser .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); + + if if_not_exists && or_replace { + return parser_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'"); + } + let table_name = self.parser.parse_object_name(true)?; let (mut columns, constraints) = self.parse_columns()?; @@ -1000,6 +1026,7 @@ impl<'a> DFParser<'a> { table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]), order_exprs: builder.order_exprs, if_not_exists, + or_replace, temporary, unbounded, options: builder.options.unwrap_or(Vec::new()), @@ -1108,6 +1135,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1125,6 +1153,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1143,6 +1172,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1161,6 +1191,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![( @@ -1182,6 +1213,7 @@ mod tests { table_partition_cols: vec!["p1".to_string(), "p2".to_string()], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1210,6 +1242,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![( @@ -1231,6 +1264,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1248,6 +1282,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1265,6 +1300,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], 
if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1283,6 +1319,26 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: true, + or_replace: false, + temporary: false, + unbounded: false, + options: vec![], + constraints: vec![], + }); + expect_parse_ok(sql, expected)?; + + // positive case: or replace + let sql = + "CREATE OR REPLACE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet'"; + let expected = Statement::CreateExternalTable(CreateExternalTable { + name: name.clone(), + columns: vec![], + file_type: "PARQUET".to_string(), + location: "foo.parquet".into(), + table_partition_cols: vec![], + order_exprs: vec![], + if_not_exists: false, + or_replace: true, temporary: false, unbounded: false, options: vec![], @@ -1304,6 +1360,7 @@ mod tests { table_partition_cols: vec!["p1".to_string()], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1335,6 +1392,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![("k1".into(), Value::SingleQuotedString("v1".into()))], @@ -1353,6 +1411,7 @@ mod tests { table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![ @@ -1401,6 +1460,7 @@ mod tests { with_fill: None, }]], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1448,6 +1508,7 @@ mod tests { }, ]], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1488,6 +1549,7 @@ mod tests { with_fill: None, }]], if_not_exists: false, + or_replace: false, temporary: false, unbounded: false, options: vec![], @@ -1495,7 +1557,7 @@ mod tests { }); expect_parse_ok(sql, expected)?; - // Most complete CREATE EXTERNAL TABLE statement possible + // Most complete CREATE EXTERNAL TABLE statement possible (using IF NOT EXISTS) let sql = " CREATE UNBOUNDED EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float) STORED AS PARQUET @@ -1537,6 +1599,75 @@ mod tests { with_fill: None, }]], if_not_exists: true, + or_replace: false, + temporary: false, + unbounded: true, + options: vec![ + ( + "format.compression".into(), + Value::SingleQuotedString("zstd".into()), + ), + ( + "format.delimiter".into(), + Value::SingleQuotedString("*".into()), + ), + ( + "ROW_GROUP_SIZE".into(), + Value::SingleQuotedString("1024".into()), + ), + ("TRUNCATE".into(), Value::SingleQuotedString("NO".into())), + ( + "format.has_header".into(), + Value::SingleQuotedString("true".into()), + ), + ], + constraints: vec![], + }); + expect_parse_ok(sql, expected)?; + + // Most complete CREATE EXTERNAL TABLE statement possible (using OR REPLACE) + let sql = " + CREATE OR REPLACE UNBOUNDED EXTERNAL TABLE t (c1 int, c2 float) + STORED AS PARQUET + WITH ORDER (c1 - c2 ASC) + PARTITIONED BY (c1) + LOCATION 'foo.parquet' + OPTIONS ('format.compression' 'zstd', + 'format.delimiter' '*', + 'ROW_GROUP_SIZE' '1024', + 'TRUNCATE' 'NO', + 'format.has_header' 'true')"; + let expected = Statement::CreateExternalTable(CreateExternalTable { + name: name.clone(), + columns: vec![ + make_column_def("c1", DataType::Int(None)), + make_column_def("c2", DataType::Float(None)), + ], + file_type: "PARQUET".to_string(), + location: "foo.parquet".into(), + table_partition_cols: vec!["c1".into()], + order_exprs: vec![vec![OrderByExpr { + expr: 
Expr::BinaryOp { + left: Box::new(Identifier(Ident { + value: "c1".to_owned(), + quote_style: None, + span: Span::empty(), + })), + op: BinaryOperator::Minus, + right: Box::new(Identifier(Ident { + value: "c2".to_owned(), + quote_style: None, + span: Span::empty(), + })), + }, + options: OrderByOptions { + asc: Some(true), + nulls_first: None, + }, + with_fill: None, + }]], + if_not_exists: false, + or_replace: true, temporary: false, unbounded: true, options: vec![ diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index fc678a8f87..79e8cd8e12 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -708,7 +708,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(DataType::Time64(TimeUnit::Nanosecond)) } else { // We don't support TIMETZ and TIME WITH TIME ZONE for now - not_impl_err!("Unsupported SQL type {sql_type:?}") + not_impl_err!("Unsupported SQL type {sql_type}") } } SQLDataType::Numeric(exact_number_info) @@ -819,7 +819,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::TsVector | SQLDataType::TsQuery | SQLDataType::GeometricType(_) => { - not_impl_err!("Unsupported SQL type {sql_type:?}") + not_impl_err!("Unsupported SQL type {sql_type}") } } } diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index c06147e08f..9f8b483b8f 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -21,14 +21,15 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::stack::StackGuard; use datafusion_common::{not_impl_err, Constraints, DFSchema, Result}; -use datafusion_expr::expr::Sort; +use datafusion_expr::expr::{Sort, WildcardOptions}; +use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ CreateMemoryTable, DdlStatement, Distinct, Expr, LogicalPlan, LogicalPlanBuilder, }; use sqlparser::ast::{ - Expr as SQLExpr, Ident, LimitClause, OrderBy, OrderByExpr, OrderByKind, Query, - SelectInto, SetExpr, + Expr as SQLExpr, Ident, LimitClause, Offset, OffsetRows, OrderBy, OrderByExpr, + OrderByKind, PipeOperator, Query, SelectInto, SetExpr, }; use sqlparser::tokenizer::Span; @@ -49,7 +50,7 @@ impl SqlToRel<'_, S> { } let set_expr = *query.body; - match set_expr { + let plan = match set_expr { SetExpr::Select(mut select) => { let select_into = select.into.take(); let plan = @@ -78,6 +79,75 @@ impl SqlToRel<'_, S> { let plan = self.order_by(plan, order_by_rex)?; self.limit(plan, query.limit_clause, planner_context) } + }?; + + self.pipe_operators(plan, query.pipe_operators, planner_context) + } + + /// Apply pipe operators to a plan + fn pipe_operators( + &self, + mut plan: LogicalPlan, + pipe_operators: Vec, + planner_context: &mut PlannerContext, + ) -> Result { + for pipe_operator in pipe_operators { + plan = self.pipe_operator(plan, pipe_operator, planner_context)?; + } + Ok(plan) + } + + /// Apply a pipe operator to a plan + fn pipe_operator( + &self, + plan: LogicalPlan, + pipe_operator: PipeOperator, + planner_context: &mut PlannerContext, + ) -> Result { + match pipe_operator { + PipeOperator::Where { expr } => { + self.plan_selection(Some(expr), plan, planner_context) + } + PipeOperator::OrderBy { exprs } => { + let sort_exprs = self.order_by_to_sort_expr( + exprs, + plan.schema(), + planner_context, + true, + None, + )?; + self.order_by(plan, sort_exprs) + } + PipeOperator::Limit { expr, offset } => self.limit( + plan, + Some(LimitClause::LimitOffset { + limit: Some(expr), + offset: offset.map(|offset| Offset { + value: offset, + rows: 
OffsetRows::None, + }), + limit_by: vec![], + }), + planner_context, + ), + PipeOperator::Select { exprs } => { + let empty_from = matches!(plan, LogicalPlan::EmptyRelation(_)); + let select_exprs = + self.prepare_select_exprs(&plan, exprs, empty_from, planner_context)?; + self.project(plan, select_exprs) + } + PipeOperator::Extend { exprs } => { + let empty_from = matches!(plan, LogicalPlan::EmptyRelation(_)); + let extend_exprs = + self.prepare_select_exprs(&plan, exprs, empty_from, planner_context)?; + let all_exprs = + std::iter::once(SelectExpr::Wildcard(WildcardOptions::default())) + .chain(extend_exprs) + .collect(); + self.project(plan, all_exprs) + } + + x => not_impl_err!("`{x}` pipe operator is not supported yet"), } } diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index aa37d74fd4..9dfa078701 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -57,7 +57,7 @@ impl SqlToRel<'_, S> { planner_context, ) } else { - plan_err!("Unsupported function argument type: {:?}", arg) + plan_err!("Unsupported function argument type: {}", arg) } }) .collect::>(); diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 26dbf45fbc..42013a76a8 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -585,7 +585,7 @@ impl SqlToRel<'_, S> { Ok((intermediate_plan, intermediate_select_exprs)) } - fn plan_selection( + pub(crate) fn plan_selection( &self, selection: Option, plan: LogicalPlan, @@ -666,7 +666,7 @@ impl SqlToRel<'_, S> { } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. - fn prepare_select_exprs( + pub(crate) fn prepare_select_exprs( &self, plan: &LogicalPlan, projection: Vec, @@ -826,7 +826,11 @@ impl SqlToRel<'_, S> { } /// Wrap a plan in a projection - fn project(&self, input: LogicalPlan, expr: Vec) -> Result { + pub(crate) fn project( + &self, + input: LogicalPlan, + expr: Vec, + ) -> Result { // convert to Expr for validate_schema_satisfies_exprs let exprs = expr .iter() diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 7f94fce7bd..44e9246142 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -519,7 +519,6 @@ impl SqlToRel<'_, S> { } } } - Statement::CreateView { or_replace, materialized, @@ -1504,6 +1503,7 @@ impl SqlToRel<'_, S> { unbounded, options, constraints, + or_replace, } = statement; // Merge inline constraints and existing constraints @@ -1552,6 +1552,7 @@ impl SqlToRel<'_, S> { file_type, table_partition_cols, if_not_exists, + or_replace, temporary, definition, order_exprs: ordered_exprs, diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index 0a5a03fcad..647ad68067 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -21,7 +21,9 @@ use super::{ utils::character_length_to_sql, utils::date_part_to_sql, utils::sqlite_date_trunc_to_sql, utils::sqlite_from_unixtime_to_sql, Unparser, }; +use arrow::array::timezone::Tz; use arrow::datatypes::TimeUnit; +use chrono::DateTime; use datafusion_common::Result; use datafusion_expr::Expr; use regex::Regex; @@ -204,6 +206,11 @@ pub trait Dialect: Send + Sync { fn col_alias_overrides(&self, _alias: &str) -> Result> { Ok(None) } + + /// Allows the dialect to override logic of formatting datetime with tz into string. 
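+ /// The default implementation simply uses `DateTime::to_string`; a custom dialect could, for example, return `dt.to_rfc3339()` instead (illustrative only — see the `DuckDBDialect` override below for a concrete per-`TimeUnit` format).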
+ fn timestamp_with_tz_to_string(&self, dt: DateTime, _unit: TimeUnit) -> String { + dt.to_string() + } } /// `IntervalStyle` to use for unparsing @@ -401,6 +408,17 @@ impl Dialect for DuckDBDialect { Ok(None) } + + fn timestamp_with_tz_to_string(&self, dt: DateTime, unit: TimeUnit) -> String { + let format = match unit { + TimeUnit::Second => "%Y-%m-%d %H:%M:%S%:z", + TimeUnit::Millisecond => "%Y-%m-%d %H:%M:%S%.3f%:z", + TimeUnit::Microsecond => "%Y-%m-%d %H:%M:%S%.6f%:z", + TimeUnit::Nanosecond => "%Y-%m-%d %H:%M:%S%.9f%:z", + }; + + dt.format(format).to_string() + } } pub struct MySqlDialect {} diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 271dbe030a..8f5b9cef08 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -35,7 +35,9 @@ use arrow::array::{ }, ArrayRef, Date32Array, Date64Array, PrimitiveArray, }; -use arrow::datatypes::{DataType, Decimal128Type, Decimal256Type, DecimalType}; +use arrow::datatypes::{ + DataType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, DecimalType, +}; use arrow::util::display::array_value_to_string; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, Result, @@ -604,7 +606,7 @@ impl Unparser<'_> { } fn named_struct_to_sql(&self, args: &[Expr]) -> Result { - if args.len() % 2 != 0 { + if !args.len().is_multiple_of(2) { return internal_err!("named_struct must have an even number of arguments"); } @@ -1062,8 +1064,19 @@ impl Unparser<'_> { where i64: From, { + let time_unit = match T::DATA_TYPE { + DataType::Timestamp(unit, _) => unit, + _ => { + return Err(internal_datafusion_err!( + "Expected Timestamp, got {:?}", + T::DATA_TYPE + )) + } + }; + let ts = if let Some(tz) = tz { - v.to_array()? + let dt = v + .to_array()? .as_any() .downcast_ref::>() .ok_or(internal_datafusion_err!( @@ -1072,8 +1085,8 @@ impl Unparser<'_> { .value_as_datetime_with_tz(0, tz.parse()?) .ok_or(internal_datafusion_err!( "Unable to convert {v:?} to DateTime" - ))? - .to_string() + ))?; + self.dialect.timestamp_with_tz_to_string(dt, time_unit) } else { v.to_array()? .as_any() @@ -1088,16 +1101,6 @@ impl Unparser<'_> { .to_string() }; - let time_unit = match T::DATA_TYPE { - DataType::Timestamp(unit, _) => unit, - _ => { - return Err(internal_datafusion_err!( - "Expected Timestamp, got {:?}", - T::DATA_TYPE - )) - } - }; - Ok(ast::Expr::Cast { kind: ast::CastKind::Cast, expr: Box::new(ast::Expr::value(SingleQuotedString(ts))), @@ -1182,6 +1185,20 @@ impl Unparser<'_> { Ok(ast::Expr::value(ast::Value::Number(f_val, false))) } ScalarValue::Float64(None) => Ok(ast::Expr::value(ast::Value::Null)), + ScalarValue::Decimal32(Some(value), precision, scale) => { + Ok(ast::Expr::value(ast::Value::Number( + Decimal32Type::format_decimal(*value, *precision, *scale), + false, + ))) + } + ScalarValue::Decimal32(None, ..) => Ok(ast::Expr::value(ast::Value::Null)), + ScalarValue::Decimal64(Some(value), precision, scale) => { + Ok(ast::Expr::value(ast::Value::Number( + Decimal64Type::format_decimal(*value, *precision, *scale), + false, + ))) + } + ScalarValue::Decimal64(None, ..) 
=> Ok(ast::Expr::value(ast::Value::Null)), ScalarValue::Decimal128(Some(value), precision, scale) => { Ok(ast::Expr::value(ast::Value::Number( Decimal128Type::format_decimal(*value, *precision, *scale), @@ -1658,7 +1675,7 @@ impl Unparser<'_> { fn arrow_dtype_to_ast_dtype(&self, data_type: &DataType) -> Result { match data_type { DataType::Null => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Boolean => Ok(ast::DataType::Bool), DataType::Int8 => Ok(ast::DataType::TinyInt(None)), @@ -1670,7 +1687,7 @@ impl Unparser<'_> { DataType::UInt32 => Ok(ast::DataType::IntegerUnsigned(None)), DataType::UInt64 => Ok(ast::DataType::BigIntUnsigned(None)), DataType::Float16 => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Float32 => Ok(ast::DataType::Float(None)), DataType::Float64 => Ok(self.dialect.float64_ast_dtype()), @@ -1680,59 +1697,55 @@ impl Unparser<'_> { DataType::Date32 => Ok(self.dialect.date32_cast_dtype()), DataType::Date64 => Ok(self.ast_type_for_date64_in_cast()), DataType::Time32(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Time64(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Duration(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Interval(_) => Ok(ast::DataType::Interval), DataType::Binary => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::FixedSizeBinary(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::LargeBinary => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::BinaryView => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Utf8 => Ok(self.dialect.utf8_cast_dtype()), DataType::LargeUtf8 => Ok(self.dialect.large_utf8_cast_dtype()), DataType::Utf8View => Ok(self.dialect.utf8_cast_dtype()), DataType::List(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::FixedSizeList(_, _) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::LargeList(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::ListView(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::LargeListView(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Struct(_) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Union(_, _) => { - 
not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), - DataType::Decimal32(_precision, _scale) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } - DataType::Decimal64(_precision, _scale) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") - } - DataType::Decimal128(precision, scale) + DataType::Decimal32(precision, scale) + | DataType::Decimal64(precision, scale) + | DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; let mut new_scale = *scale as u64; @@ -1746,10 +1759,10 @@ impl Unparser<'_> { )) } DataType::Map(_, _) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } DataType::RunEndEncoded(_, _) => { - not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + not_impl_err!("Unsupported DataType: conversion: {data_type}") } } } @@ -2179,6 +2192,20 @@ mod tests { (col("need-quoted").eq(lit(1)), r#"("need-quoted" = 1)"#), (col("need quoted").eq(lit(1)), r#"("need quoted" = 1)"#), // See test_interval_scalar_to_expr for interval literals + ( + (col("a") + col("b")).gt(Expr::Literal( + ScalarValue::Decimal32(Some(1123), 4, 3), + None, + )), + r#"((a + b) > 1.123)"#, + ), + ( + (col("a") + col("b")).gt(Expr::Literal( + ScalarValue::Decimal64(Some(1123), 4, 3), + None, + )), + r#"((a + b) > 1.123)"#, + ), ( (col("a") + col("b")).gt(Expr::Literal( ScalarValue::Decimal128(Some(100123), 28, 3), @@ -3193,4 +3220,81 @@ mod tests { Ok(()) } + + #[test] + fn test_timestamp_with_tz_format() -> Result<()> { + let default_dialect: Arc = + Arc::new(CustomDialectBuilder::new().build()); + + let duckdb_dialect: Arc = Arc::new(DuckDBDialect::new()); + + for (dialect, scalar, expected) in [ + ( + Arc::clone(&default_dialect), + ScalarValue::TimestampSecond(Some(1757934000), Some("+00:00".into())), + "CAST('2025-09-15 11:00:00 +00:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&default_dialect), + ScalarValue::TimestampMillisecond( + Some(1757934000123), + Some("+01:00".into()), + ), + "CAST('2025-09-15 12:00:00.123 +01:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&default_dialect), + ScalarValue::TimestampMicrosecond( + Some(1757934000123456), + Some("-01:00".into()), + ), + "CAST('2025-09-15 10:00:00.123456 -01:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&default_dialect), + ScalarValue::TimestampNanosecond( + Some(1757934000123456789), + Some("+00:00".into()), + ), + "CAST('2025-09-15 11:00:00.123456789 +00:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&duckdb_dialect), + ScalarValue::TimestampSecond(Some(1757934000), Some("+00:00".into())), + "CAST('2025-09-15 11:00:00+00:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&duckdb_dialect), + ScalarValue::TimestampMillisecond( + Some(1757934000123), + Some("+01:00".into()), + ), + "CAST('2025-09-15 12:00:00.123+01:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&duckdb_dialect), + ScalarValue::TimestampMicrosecond( + Some(1757934000123456), + Some("-01:00".into()), + ), + "CAST('2025-09-15 10:00:00.123456-01:00' AS TIMESTAMP)", + ), + ( + Arc::clone(&duckdb_dialect), + ScalarValue::TimestampNanosecond( + Some(1757934000123456789), + Some("+00:00".into()), + ), + "CAST('2025-09-15 11:00:00.123456789+00:00' AS TIMESTAMP)", + ), + ] { + let unparser = Unparser::new(dialect.as_ref()); + + let expr = Expr::Literal(scalar, None); 
+ + let actual = format!("{}", unparser.expr_to_sql(&expr)?); + assert_eq!(actual, expected); + } + Ok(()) + } } diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs index b429853fe9..343a90af3e 100644 --- a/datafusion/sql/tests/cases/params.rs +++ b/datafusion/sql/tests/cases/params.rs @@ -20,6 +20,7 @@ use arrow::datatypes::DataType; use datafusion_common::{assert_contains, ParamValues, ScalarValue}; use datafusion_expr::{LogicalPlan, Prepare, Statement}; use insta::assert_snapshot; +use itertools::Itertools as _; use std::collections::HashMap; pub struct ParameterTest<'a> { @@ -54,7 +55,7 @@ fn generate_prepare_stmt_and_data_types(sql: &str) -> (LogicalPlan, String) { let plan = logical_plan(sql).unwrap(); let data_types = match &plan { LogicalPlan::Statement(Statement::Prepare(Prepare { data_types, .. })) => { - format!("{data_types:?}") + data_types.iter().join(", ").to_string() } _ => panic!("Expected a Prepare statement"), }; @@ -160,7 +161,7 @@ fn test_prepare_statement_to_plan_no_param() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32]"#); + assert_snapshot!(dt, @r#"Int32"#); /////////////////// // replace params with values @@ -188,7 +189,7 @@ fn test_prepare_statement_to_plan_no_param() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[]"#); + assert_snapshot!(dt, @r#""#); /////////////////// // replace params with values @@ -269,7 +270,7 @@ fn test_prepare_statement_to_plan_params_as_constants() { EmptyRelation: rows=1 "# ); - assert_snapshot!(dt, @r#"[Int32]"#); + assert_snapshot!(dt, @r#"Int32"#); /////////////////// // replace params with values @@ -294,7 +295,7 @@ fn test_prepare_statement_to_plan_params_as_constants() { EmptyRelation: rows=1 "# ); - assert_snapshot!(dt, @r#"[Int32]"#); + assert_snapshot!(dt, @r#"Int32"#); /////////////////// // replace params with values @@ -319,7 +320,7 @@ fn test_prepare_statement_to_plan_params_as_constants() { EmptyRelation: rows=1 "# ); - assert_snapshot!(dt, @r#"[Int32, Float64]"#); + assert_snapshot!(dt, @r#"Int32, Float64"#); /////////////////// // replace params with values @@ -686,7 +687,7 @@ fn test_prepare_statement_to_plan_one_param() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32]"#); + assert_snapshot!(dt, @r#"Int32"#); /////////////////// // replace params with values @@ -719,7 +720,7 @@ fn test_prepare_statement_to_plan_data_type() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Float64]"#); + assert_snapshot!(dt, @r#"Float64"#); /////////////////// // replace params with values still succeed and use Float64 @@ -752,7 +753,7 @@ fn test_prepare_statement_to_plan_multi_params() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"#); + assert_snapshot!(dt, @r#"Int32, Utf8View, Float64, Int32, Float64, Utf8View"#); /////////////////// // replace params with values @@ -797,7 +798,7 @@ fn test_prepare_statement_to_plan_having() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32, Float64, Float64, Float64]"#); + assert_snapshot!(dt, @r#"Int32, Float64, Float64, Float64"#); /////////////////// // replace params with values @@ -836,7 +837,7 @@ fn test_prepare_statement_to_plan_limit() { TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int64, Int64]"#); + assert_snapshot!(dt, @r#"Int64, Int64"#); // replace params with values let param_values = vec![ScalarValue::Int64(Some(10)), ScalarValue::Int64(Some(200))]; diff --git a/datafusion/sql/tests/sql_integration.rs 
b/datafusion/sql/tests/sql_integration.rs index 9e16ab297e..f66af28f43 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -4664,7 +4664,7 @@ fn test_custom_type_plan() -> Result<()> { let err = planner.statement_to_plan(ast.pop_front().unwrap()); assert_contains!( err.unwrap_err().to_string(), - "This feature is not implemented: Unsupported SQL type Datetime(None)" + "This feature is not implemented: Unsupported SQL type DATETIME" ); fn plan_sql(sql: &str) -> LogicalPlan { diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 8f63bf84ff..d02d5f9cb5 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -53,7 +53,7 @@ itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } postgres-protocol = { version = "0.6.7", optional = true } -postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true } +postgres-types = { version = "0.2.10", features = ["derive", "with-chrono-0_4"], optional = true } rust_decimal = { version = "1.38.0", features = ["tokio-pg"] } # When updating the following dependency verify that sqlite test file regeneration works correctly # by running the regenerate_sqlite_files.sh script. @@ -62,9 +62,9 @@ sqlparser = { workspace = true } tempfile = { workspace = true } testcontainers = { workspace = true, optional = true } testcontainers-modules = { workspace = true, features = ["postgres"], optional = true } -thiserror = "2.0.16" +thiserror = "2.0.17" tokio = { workspace = true } -tokio-postgres = { version = "0.7.12", optional = true } +tokio-postgres = { version = "0.7.14", optional = true } [features] avro = ["datafusion/avro"] diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 3fdb29c9d5..a389ae1ef6 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -17,23 +17,29 @@ under the License. --> -# DataFusion sqllogictest +# Apache DataFusion sqllogictest -[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. -This crate is a submodule of DataFusion that contains an implementation of [sqllogictest](https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki). +This crate is a submodule of DataFusion that contains an implementation of [sqllogictest]. -[df]: https://crates.io/crates/datafusion +[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ +[sqllogictest]: https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki ## Overview -This crate uses [sqllogictest-rs](https://github.com/risinglightdb/sqllogictest-rs) to parse and run `.slt` files in the -[`test_files`](test_files) directory of this crate or the [`data/sqlite`](https://github.com/apache/datafusion-testing/tree/main/data/sqlite) -directory of the [datafusion-testing](https://github.com/apache/datafusion-testing) crate. +This crate uses [sqllogictest-rs] to parse and run `.slt` files in the [`test_files`] directory of +this crate or the [`data/sqlite`] directory of the [datafusion-testing] repository. 
+ +[sqllogictest-rs]: https://github.com/risinglightdb/sqllogictest-rs +[`test_files`]: test_files +[`data/sqlite`]: https://github.com/apache/datafusion-testing/tree/main/data/sqlite +[datafusion-testing]: https://github.com/apache/datafusion-testing ## Testing setup -1. `rustup update stable` DataFusion uses the latest stable release of rust +1. `rustup update stable` DataFusion uses the latest stable release of Rust 2. `git submodule init` 3. `git submodule update --init --remote --recursive` diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 143e3ef1a8..b499401e55 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; use std::collections::HashMap; use std::fs::File; use std::io::Write; @@ -31,8 +32,13 @@ use arrow::record_batch::RecordBatch; use datafusion::catalog::{ CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session, }; -use datafusion::common::DataFusionError; -use datafusion::logical_expr::{create_udf, ColumnarValue, Expr, ScalarUDF, Volatility}; +use datafusion::common::{not_impl_err, DataFusionError, Result}; +use datafusion::functions::math::abs; +use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl}; +use datafusion::logical_expr::{ + create_udf, ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, +}; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; use datafusion::{ @@ -133,6 +139,10 @@ impl TestContext { info!("Registering table with union column"); register_union_table(test_ctx.session_ctx()) } + "async_udf.slt" => { + info!("Registering dummy async udf"); + register_async_abs_udf(test_ctx.session_ctx()) + } _ => { info!("Using default SessionContext"); } @@ -235,7 +245,7 @@ pub async fn register_temp_table(ctx: &SessionContext) { #[async_trait] impl TableProvider for TestTable { - fn as_any(&self) -> &dyn std::any::Any { + fn as_any(&self) -> &dyn Any { self } @@ -458,3 +468,48 @@ fn register_union_table(ctx: &SessionContext) { ctx.register_batch("union_table", batch).unwrap(); } + +fn register_async_abs_udf(ctx: &SessionContext) { + #[derive(Debug, PartialEq, Eq, Hash)] + struct AsyncAbs { + inner_abs: Arc<ScalarUDF>, + } + impl AsyncAbs { + fn new() -> Self { + AsyncAbs { inner_abs: abs() } + } + } + impl ScalarUDFImpl for AsyncAbs { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "async_abs" + } + + fn signature(&self) -> &Signature { + self.inner_abs.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { + self.inner_abs.return_type(arg_types) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + not_impl_err!("{} can only be called from async contexts", self.name()) + } + } + #[async_trait] + impl AsyncScalarUDFImpl for AsyncAbs { + async fn invoke_async_with_args( + &self, + args: ScalarFunctionArgs, + ) -> Result<ColumnarValue> { + return self.inner_abs.invoke_with_args(args); + } + } + let async_abs = AsyncAbs::new(); + let udf = AsyncScalarUDF::new(Arc::new(async_abs)); + ctx.register_udf(udf.into_scalar_udf()); +} diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index caf8d637ec..9d6c7b11ad 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -32,10 +32,12 @@
CREATE EXTERNAL TABLE aggregate_test_100 ( c10 BIGINT UNSIGNED NOT NULL, c11 FLOAT NOT NULL, c12 DOUBLE NOT NULL, - c13 VARCHAR NOT NULL + c13 VARCHAR NOT NULL, + c14 DATE NOT NULL, + c15 TIMESTAMP NOT NULL, ) STORED AS CSV -LOCATION '../../testing/data/csv/aggregate_test_100.csv' +LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv' OPTIONS ('format.has_header' 'true'); statement ok @@ -148,10 +150,10 @@ SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_ statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function SELECT approx_percentile_cont(0.95, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Int16, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int16, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Float64, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Float64, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 statement error DataFusion error: This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal @@ -555,7 +557,7 @@ SELECT corr(c2, c12) FROM aggregate_test_100 query R select corr(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq ---- -0 +NULL # all_nulls_query_correlation query R @@ -723,10 +725,6 @@ SELECT c2, var_samp(c12) FROM aggregate_test_100 WHERE c12 > 0.90 GROUP BY c2 OR 4 NULL 5 0.000269544643 -# Use PostgresSQL dialect -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - # csv_query_stddev_12 query IR SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.95) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2 @@ -737,9 +735,30 @@ SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.95) FROM aggregate_test_100 GROUP 4 NULL 5 NULL -# Restore the default dialect statement ok -set datafusion.sql_parser.dialect = 'Generic'; +CREATE TABLE t ( + a DOUBLE, + b BIGINT, + c INT +) AS VALUES +(1.0, 10, -5), +(2.0, 20, -5), +(3.0, 20, 4); + +# https://github.com/apache/datafusion/issues/15291 +query III +WITH s AS ( + SELECT + COUNT(a) FILTER (WHERE (b * b) - 3600 <= b), + COUNT(a) FILTER (WHERE (b * b) - 3000 <= b AND (c >= 0)), + COUNT(a) FILTER (WHERE (b * b) - 3000 <= b AND (c >= 0) AND (c >= 0)) + FROM t +) SELECT * FROM s +---- +3 1 1 + +statement ok +DROP TABLE t # csv_query_stddev_13 query IR @@ -1307,12 +1326,24 @@ SELECT COUNT(2) FROM aggregate_test_100 # ---- # 100 99 +# csv_query_approx_count_literal_null +query I +SELECT approx_distinct(null) +---- +0 + # csv_query_approx_count_dupe_expr_aliased query II SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS b FROM aggregate_test_100 ---- 100 100 +# csv_query_approx_count_date_timestamp +query IIIII +SELECT approx_distinct(c14) AS a, 
approx_distinct(c15) AS b, approx_distinct(arrow_cast(c15, 'Date64')), approx_distinct(arrow_cast(c15, 'Time32(Second)')) as c, approx_distinct(arrow_cast(c15, 'Time64(Nanosecond)')) AS d FROM aggregate_test_100 +---- +18 60 60 60 60 + ## This test executes the APPROX_PERCENTILE_CONT aggregation against the test ## data, asserting the estimated quantiles are ±5% their actual values. ## @@ -1526,6 +1557,19 @@ SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST( ---- NULL +# percentile_cont_with_weight_with_nulls +query I +SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(w, 0.5) WITHIN GROUP (ORDER BY v) +FROM (VALUES (1, 1), (2, 1), (3, 1), (4, NULL), (NULL, 1), (NULL, NULL)) as t (v, w); +---- +2 + +# percentile_cont_with_weight_nulls_only +query I +SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(1, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); +---- +NULL + # # percentile_cont edge cases # @@ -1821,6 +1865,29 @@ c 122 d 124 e 115 + +# using approx_percentile_cont on 2 columns with same signature +query TII +SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 5 73 +b 5 68 +c 5 122 +d 5 124 +e 5 115 + +# error is unique to this UDAF +query TRR +SELECT c1, avg(c2) AS c2, avg(c3) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 2.857142857143 -18.333333333333 +b 3.263157894737 -5.842105263158 +c 2.666666666667 -1.333333333333 +d 2.444444444444 25.444444444444 +e 3 40.333333333333 + + + query TI SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- @@ -2367,7 +2434,7 @@ drop table t; # test count with largeutf8 statement ok -create table t (c string) as values +create table t (c string) as values (arrow_cast('a', 'LargeUtf8')), (arrow_cast('b', 'LargeUtf8')), (arrow_cast(null, 'LargeUtf8')), @@ -2726,7 +2793,7 @@ CREATE OR REPLACE TABLE corr_single_row( query R SELECT corr(x, y) FROM corr_single_row; ---- -0 +NULL # correlation with all nulls statement ok @@ -4719,9 +4786,7 @@ statement ok create table t as select arrow_cast(column1, 'Date32') as date32, - -- Workaround https://github.com/apache/arrow-rs/issues/4512 is fixed, can use this - -- arrow_cast(column1, 'Date64') as date64, - arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64, + arrow_cast(column1, 'Date64') as date64, column2 as names, column3 as tag from t_source; @@ -5047,7 +5112,7 @@ statement ok create table t (c1 decimal(10, 0), c2 int) as values (null, null), (null, null), (null, null); query RTIT -select +select sum(c1), arrow_typeof(sum(c1)), sum(c2), arrow_typeof(sum(c2)) from t; @@ -5124,10 +5189,6 @@ select c2, count(DISTINCT cast(c1 AS DECIMAL(10, 2))) from d_table GROUP BY c2 O A 2 B 2 -# Use PostgresSQL dialect -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - # Creating the table statement ok CREATE TABLE test_table (c1 INT, c2 INT, c3 INT) @@ -5257,10 +5318,6 @@ select c3, count(c2), avg(c2), sum(c2), min(c2), max(c2), count(c4), sum(c4) fro 700.1 2 15.15 30.3 10.1 20.2 0 NULL NULL 1 10.1 10.1 10.1 10.1 0 NULL -# Restore the default dialect -statement ok -set datafusion.sql_parser.dialect = 'Generic'; - ## Multiple distinct aggregates and dictionaries statement ok create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')), (1, arrow_cast('bar', 
'Dictionary(Int32, Utf8)')); @@ -5299,7 +5356,7 @@ drop table dict_null_test; # avg_duration statement ok -create table d as values +create table d as values (arrow_cast(1, 'Duration(Second)'), arrow_cast(2, 'Duration(Millisecond)'), arrow_cast(3, 'Duration(Microsecond)'), arrow_cast(4, 'Duration(Nanosecond)'), 1), (arrow_cast(11, 'Duration(Second)'), arrow_cast(22, 'Duration(Millisecond)'), arrow_cast(33, 'Duration(Microsecond)'), arrow_cast(44, 'Duration(Nanosecond)'), 1); @@ -5353,7 +5410,7 @@ FROM d WHERE column1 IS NOT NULL; # Centered average window function query I?? -SELECT column5, column1, avg(column1) OVER (ORDER BY column5 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as centered_avg +SELECT column5, column1, avg(column1) OVER (ORDER BY column5 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as centered_avg FROM d WHERE column1 IS NOT NULL; ---- 1 0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 6 secs @@ -5549,7 +5606,7 @@ physical_plan 08)--------------RepartitionExec: partitioning=Hash([c3@0], 4), input_partitions=4 09)----------------AggregateExec: mode=Partial, gby=[c3@1 as c3], aggr=[min(aggregate_test_100.c1)] 10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], file_type=csv, has_header=true +11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c1, c3], file_type=csv, has_header=true # @@ -5574,7 +5631,7 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[5] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], file_type=csv, has_header=true +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c3], file_type=csv, has_header=true query I SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 order by c3 limit 5; @@ -5598,7 +5655,7 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true query II SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 order by c2, c3 limit 5 offset 4; @@ -5633,7 +5690,7 @@ physical_plan 10)------------------CoalesceBatchesExec: target_batch_size=8192 11)--------------------FilterExec: c3@1 >= 10 AND c3@1 <= 20 12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true +13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true query I SELECT DISTINCT c3 FROM 
aggregate_test_100 WHERE c3 between 10 and 20 group by c3 order by c3 limit 4; @@ -5659,7 +5716,7 @@ physical_plan 04)------CoalescePartitionsExec 05)--------AggregateExec: mode=Partial, gby=[c2@1 as c2, c3@2 as c3], aggr=[max(aggregate_test_100.c1)] 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true +07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true # TODO(msirek): Extend checking in LimitedDistinctAggregation equal groupings to ignore the order of columns # in the group-by column lists, so the limit could be pushed to the lowest AggregateExec in this case @@ -5683,7 +5740,7 @@ physical_plan 08)--------------CoalescePartitionsExec 09)----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[] 10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true +11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true query II SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c3, c2 order by c3, c2 limit 3 offset 10; @@ -5707,7 +5764,7 @@ physical_plan 04)------CoalescePartitionsExec 05)--------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[] 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true +07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true query II SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; @@ -5734,7 +5791,7 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], file_type=csv, has_header=true +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c3], file_type=csv, has_header=true statement ok set datafusion.optimizer.enable_distinct_aggregation_soft_limit = true; @@ -5806,7 +5863,7 @@ NULL NULL 3 NULL 1 4 0 8 0 # regr_*() basic tests query RRIRRRRRR -select +select regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), @@ -5821,7 +5878,7 @@ from (values (1,2), (2,4), (3,6)); 2 0 3 1 2 4 2 8 4 query RRIRRRRRR -select +select regr_slope(c12, c11), regr_intercept(c12, c11), regr_count(c12, c11), @@ -5839,7 +5896,7 @@ from aggregate_test_100; # regr_*() functions ignore NULLs query RRIRRRRRR -select +select regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), @@ -5854,7 +5911,7 
@@ from (values (1,NULL), (2,4), (3,6)); 2 0 2 1 2.5 5 0.5 2 1 query RRIRRRRRR -select +select regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), @@ -5869,7 +5926,7 @@ from (values (1,NULL), (NULL,4), (3,6)); NULL NULL 1 NULL 3 6 0 0 0 query RRIRRRRRR -select +select regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), @@ -5884,8 +5941,8 @@ from (values (1,NULL), (NULL,4), (NULL,NULL)); NULL NULL 0 NULL NULL NULL NULL NULL NULL query TRRIRRRRRR rowsort -select - column3, +select + column3, regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), @@ -5909,7 +5966,7 @@ statement ok set datafusion.execution.batch_size = 1; query RRIRRRRRR -select +select regr_slope(c12, c11), regr_intercept(c12, c11), regr_count(c12, c11), @@ -5927,7 +5984,7 @@ statement ok set datafusion.execution.batch_size = 2; query RRIRRRRRR -select +select regr_slope(c12, c11), regr_intercept(c12, c11), regr_count(c12, c11), @@ -5945,7 +6002,7 @@ statement ok set datafusion.execution.batch_size = 3; query RRIRRRRRR -select +select regr_slope(c12, c11), regr_intercept(c12, c11), regr_count(c12, c11), @@ -6185,7 +6242,7 @@ select string_agg(k, ',' order by when k = 'a' then 3 when k = 'b' then 0 when k = 'c' then 2 - when k = 'd' then 1 + when k = 'd' then 1 end) from t; ---- @@ -6197,7 +6254,7 @@ select string_agg(k, ',' order by when k = 'a' then 3 when k = 'b' then 0 when k = 'c' then 2 - when k = 'd' then 1 + when k = 'd' then 1 end desc) from t; ---- @@ -6208,7 +6265,7 @@ query TT explain select string_agg(k, ',' order by v) from t; ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v ASC NULLS LAST]]] +01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v ASC NULLS LAST]]] 02)--TableScan: t projection=[k, v] physical_plan 01)AggregateExec: mode=Single, gby=[], aggr=[string_agg(t.k,Utf8(",")) ORDER BY [t.v ASC NULLS LAST]] @@ -6224,7 +6281,7 @@ query TT explain select string_agg(k, ',' order by v desc) from t; ---- logical_plan -01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]] +01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]] 02)--TableScan: t projection=[k, v] physical_plan 01)AggregateExec: mode=Single, gby=[], aggr=[string_agg(t.k,Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]] @@ -6273,7 +6330,7 @@ CREATE TABLE float_table ( # Test string_agg with largeutf8 statement ok -create table string_agg_large_utf8 (c string) as values +create table string_agg_large_utf8 (c string) as values (arrow_cast('a', 'LargeUtf8')), (arrow_cast('b', 'LargeUtf8')), (arrow_cast('c', 'LargeUtf8')) @@ -6328,7 +6385,7 @@ select count(*) from (select count(*) a, count(*) b from (select 1)); # UTF8 string matters for string to &[u8] conversion, add it to prevent regression statement ok -create table distinct_count_string_table as values +create table distinct_count_string_table as values (1, 'a', 'longstringtest_a', '台灣'), (2, 'b', 'longstringtest_b1', '日本'), (2, 'b', 'longstringtest_b2', '中國'), @@ -6955,7 +7012,7 @@ physical_plan 03)----CoalescePartitionsExec 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(aggregate_test_100.c5)] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, 
has_header=true +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c5], file_type=csv, has_header=true statement count 0 drop table aggregate_test_100; @@ -7322,6 +7379,38 @@ SELECT a, median(b), arrow_typeof(median(b)) FROM group_median_all_nulls GROUP B group0 NULL Int32 group1 NULL Int32 +statement ok +create table t_decimal (c decimal(10, 4)) as values (100.00), (125.00), (175.00), (200.00), (200.00), (300.00), (null), (null); + +# Test avg_distinct for Decimal128 +query RT +select avg(distinct c), arrow_typeof(avg(distinct c)) from t_decimal; +---- +180 Decimal128(14, 8) + +statement ok +drop table t_decimal; + +# Test avg_distinct for Decimal256 +statement ok +create table t_decimal256 (c decimal(50, 2)) as values + (100.00), + (125.00), + (175.00), + (200.00), + (200.00), + (300.00), + (null), + (null); + +query RT +select avg(distinct c), arrow_typeof(avg(distinct c)) from t_decimal256; +---- +180 Decimal256(54, 6) + +statement ok +drop table t_decimal256; + query I with test AS (SELECT i as c1, i + 1 as c2 FROM generate_series(1, 10) t(i)) select count(*) from test WHERE 1 = 1; @@ -7444,55 +7533,65 @@ FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar); # distinct average statement ok -create table distinct_avg (a int, b double) as values - (3, null), - (2, null), - (5, 100.5), - (5, 1.0), - (5, 44.112), - (null, 1.0), - (5, 100.5), - (1, 4.09), - (5, 100.5), - (5, 100.5), - (4, null), - (null, null) +create table distinct_avg (a int, b double, c decimal(10, 4), d decimal(50, 2)) as values + (3, null, 100.2562, 90251.21), + (2, null, 100.2562, null), + (5, 100.5, null, 10000000.11), + (5, 1.0, 100.2563, -1.0), + (5, 44.112, -132.12, null), + (null, 1.0, 100.2562, 90251.21), + (5, 100.5, -100.2562, -10000000.11), + (1, 4.09, 4222.124, 0.0), + (5, 100.5, null, 10000000.11), + (5, 100.5, 1.1, 1.0), + (4, null, 4222.124, null), + (null, null, null, null) ; # Need two columns to ensure single_distinct_to_group_by rule doesn't kick in, so we know our actual avg(distinct) code is being tested -query RTRTRR +query RTRTRTRTRRRR select avg(distinct a), arrow_typeof(avg(distinct a)), avg(distinct b), arrow_typeof(avg(distinct b)), + avg(distinct c), + arrow_typeof(avg(distinct c)), + avg(distinct d), + arrow_typeof(avg(distinct d)), avg(a), - avg(b) + avg(b), + avg(c), + avg(d) from distinct_avg; ---- -3 Float64 37.4255 Float64 4 56.52525 +3 Float64 37.4255 Float64 698.56005 Decimal128(14, 8) 15041.868333 Decimal256(54, 6) 4 56.52525 957.11074444 1272562.81625 -query RR rowsort +query RRRR rowsort select avg(distinct a), - avg(distinct b) + avg(distinct b), + avg(distinct c), + avg(distinct d) from distinct_avg group by b; ---- -1 4.09 -3 NULL -5 1 -5 100.5 -5 44.112 +1 4.09 4222.124 0 +3 NULL 2161.1901 90251.21 +5 1 100.25625 45125.105 +5 100.5 -49.5781 0.333333 +5 44.112 -132.12 NULL -query RR +query RRRR select avg(distinct a), - avg(distinct b) + avg(distinct b), + avg(distinct c), + avg(distinct d) from distinct_avg -where a is null and b is null; +where a is null and b is null and c is null and d is null; ---- -NULL NULL +NULL NULL NULL NULL statement ok drop table distinct_avg; diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 8755918cd1..5dcb72b705 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -69,9 +69,6 
@@ set datafusion.execution.target_partitions = 2; statement ok set datafusion.execution.batch_size = 1; -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - # Grouping by unique fields allows to check all accumulators query ITIIII SELECT c5, c1, @@ -420,10 +417,6 @@ c true false NULL d NULL false NULL e true false NULL -# Enabling PG dialect for filtered aggregates tests -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - # Test count with filter query III SELECT diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index e720491712..764488e00f 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1521,6 +1521,11 @@ select input, array_max(input) from (select make_array(d - 1, d, d + 1) input fr [29, 30, 31] 31 [NULL, NULL, NULL] NULL +query II +select array_max(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array_max(arrow_cast(make_array(1), 'LargeList(Int64)')); +---- +3 1 + query II select array_max(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_max(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')); ---- @@ -1606,6 +1611,11 @@ select input, array_min(input) from (select make_array(d - 1, d, d + 1) input fr [29, 30, 31] 29 [NULL, NULL, NULL] NULL +query II +select array_min(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array_min(arrow_cast(make_array(1), 'LargeList(Int64)')); +---- +1 1 + query II select array_min(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_min(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')); ---- @@ -1938,6 +1948,12 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 2, ---- [2, 3, 4] [h, e] +query ?? +select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 2, 4), + array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 1, 2); +---- +[2, 3, 4] [h, e] + # array_slice scalar function #2 (with positive indexes; full array) query ?? select array_slice(make_array(1, 2, 3, 4, 5), 0, 6), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 0, 5); @@ -1949,6 +1965,12 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 0, ---- [1, 2, 3, 4, 5] [h, e, l, l, o] +query ?? +select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 0, 6), + array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 0, 5); +---- +[1, 2, 3, 4, 5] [h, e, l, l, o] + # array_slice scalar function #3 (with positive indexes; first index = second index) query ?? select array_slice(make_array(1, 2, 3, 4, 5), 4, 4), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 3, 3); @@ -2423,6 +2445,20 @@ select array_sort(make_array(1, 3, null, 5, NULL, -5)), array_sort(make_array(1, ---- [NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1] +query ??? +select array_sort(arrow_cast(make_array(1, 3, null, 5, NULL, -5), 'LargeList(Int64)')), + array_sort(arrow_cast(make_array(1, 3, null, 2), 'LargeList(Int64)'), 'ASC'), + array_sort(arrow_cast(make_array(1, 3, null, 2), 'LargeList(Int64)'), 'desc', 'NULLS FIRST'); +---- +[NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1] + +query ??? 
+select array_sort(arrow_cast(make_array(1, 3, null, 5, NULL, -5), 'FixedSizeList(6, Int64)')), + array_sort(arrow_cast(make_array(1, 3, null, 2), 'FixedSizeList(4, Int64)'), 'ASC'), + array_sort(arrow_cast(make_array(1, 3, null, 2), 'FixedSizeList(4, Int64)'), 'desc', 'NULLS FIRST'); +---- +[NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1] + query ? select array_sort(column1, 'DESC', 'NULLS LAST') from arrays_values; ---- @@ -2788,7 +2824,6 @@ select array_append(column1, arrow_cast(make_array(1, 11, 111), 'FixedSizeList(3 # DuckDB: [4] # ClickHouse: Null -# Since they dont have the same result, we just follow Postgres, return error query ? select array_prepend(4, NULL); ---- @@ -3208,6 +3243,22 @@ select array_concat( ---- [1, 2, 3] +query ? +select array_concat( + arrow_cast(['1', '2'], 'LargeList(Utf8)'), + arrow_cast(['3'], 'LargeList(Utf8)') +); +---- +[1, 2, 3] + +query ? +select array_concat( + arrow_cast(['1', '2'], 'FixedSizeList(2, Utf8)'), + arrow_cast(['3'], 'FixedSizeList(1, Utf8)') +); +---- +[1, 2, 3] + # Concatenating string arrays query ? select array_concat( @@ -3518,6 +3569,11 @@ select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'), ---- 3 5 1 +query III +select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l'), array_position(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5), array_position(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1); +---- +3 5 1 + # array_position scalar function #2 (with optional argument) query III select array_position(['h', 'e', 'l', 'l', 'o'], 'l', 4), array_position([1, 2, 5, 4, 5], 5, 4), array_position([1, 1, 1], 1, 2); @@ -3529,6 +3585,11 @@ select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'), ---- 4 5 2 +query III +select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l', 4), array_position(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5, 4), array_position(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1, 2); +---- +4 5 2 + # array_position scalar function #3 (element is list) query II select array_position(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), [4, 5, 6]), array_position(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), [2, 3, 4]); @@ -3831,6 +3892,14 @@ select ---- [1, 3, 3, 4] [1, 0, 4, 5, 4, 6, 7] [1, 2, 3] +query ??? +select + array_replace(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3), + array_replace(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0), + array_replace(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0); +---- +[1, 3, 3, 4] [1, 0, 4, 5, 4, 6, 7] [1, 2, 3] + # array_replace scalar function #2 (element is list) query ?? select @@ -3862,6 +3931,21 @@ select ---- [[1, 2, 3], [1, 1, 1], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]] +query ?? +select + array_replace( + arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [4, 5, 6], + [1, 1, 1] + ), + array_replace( + arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [2, 3, 4], + [3, 1, 4] + ); +---- +[[1, 2, 3], [1, 1, 1], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]] + # list_replace scalar function #3 (function alias `list_replace`) query ??? 
select list_replace( @@ -4003,6 +4087,14 @@ select ---- [1, 3, 3, 4] [1, 0, 0, 5, 4, 6, 7] [1, 2, 3] +query ??? +select + array_replace_n(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3, 2), + array_replace_n(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0, 2), + array_replace_n(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0, 3); +---- +[1, 3, 3, 4] [1, 0, 0, 5, 4, 6, 7] [1, 2, 3] + # array_replace_n scalar function #2 (element is list) query ?? select @@ -4038,6 +4130,23 @@ select ---- [[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]] +query ?? +select + array_replace_n( + arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [4, 5, 6], + [1, 1, 1], + 2 + ), + array_replace_n( + arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [2, 3, 4], + [3, 1, 4], + 2 + ); +---- +[[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]] + # list_replace_n scalar function #3 (function alias `array_replace_n`) query ??? select @@ -4194,6 +4303,14 @@ select ---- [1, 3, 3, 4] [1, 0, 0, 5, 0, 6, 7] [1, 2, 3] +query ??? +select + array_replace_all(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3), + array_replace_all(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0), + array_replace_all(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0); +---- +[1, 3, 3, 4] [1, 0, 0, 5, 0, 6, 7] [1, 2, 3] + # array_replace_all scalar function #2 (element is list) query ?? select @@ -4225,6 +4342,21 @@ select ---- [[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]] +query ?? +select + array_replace_all( + arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [4, 5, 6], + [1, 1, 1] + ), + array_replace_all( + arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), + [2, 3, 4], + [3, 1, 4] + ); +---- +[[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]] + # list_replace_all scalar function #3 (function alias `array_replace_all`) query ??? select @@ -4490,6 +4622,16 @@ select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, ---- [1, 2, 3, 4, 5, 6] +query ? +select array_union(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6, 3, 4], 'FixedSizeList(4, Int64)')); +---- +[1, 2, 3, 4, 5, 6] + +query ? +select array_union(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6], 'FixedSizeList(2, Int64)')); +---- +[1, 2, 3, 4, 5, 6] + # array_union scalar function #2 query ? 
select array_union([1, 2, 3, 4], [5, 6, 7, 8]); @@ -4689,6 +4831,11 @@ select array_to_string(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'), ---- h,e,l,l,o 1-2-3-4-5 1|2|3 +query TTT +select array_to_string(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), ','), array_to_string(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), '-'), array_to_string(arrow_cast([1.0, 2.0, 3.0], 'FixedSizeList(3, Float64)'), '|'); +---- +h,e,l,l,o 1-2-3-4-5 1|2|3 + # array_to_string scalar function with nulls #2 query TTT select array_to_string(make_array('h', NULL, NULL, NULL, 'o'), ',', '-'), array_to_string(make_array(NULL, 2, NULL, 4, 5), '-', 'nil'), array_to_string(make_array(1.0, NULL, 3.0), '|', '0'); @@ -4700,6 +4847,16 @@ select array_to_string(arrow_cast(make_array('h', NULL, NULL, NULL, 'o'), 'Large ---- h,-,-,-,o nil-2-nil-4-5 1|0|3 +query TTT +select array_to_string(arrow_cast(make_array('h', NULL, NULL, NULL, 'o'), 'FixedSizeList(5, Utf8)'), ',', '-'), array_to_string(arrow_cast(make_array(NULL, 2, NULL, 4, 5), 'FixedSizeList(5, Int64)'), '-', 'nil'), array_to_string(arrow_cast(make_array(1.0, NULL, 3.0), 'FixedSizeList(3, Float64)'), '|', '0'); +---- +h,-,-,-,o nil-2-nil-4-5 1|0|3 + +query T +select array_to_string(arrow_cast([arrow_cast([NULL, 'a'], 'FixedSizeList(2, Utf8)'), NULL], 'FixedSizeList(2, FixedSizeList(2, Utf8))'), ',', '-'); +---- +-,a,-,- + # array_to_string with columns #1 # For reference @@ -4965,6 +5122,12 @@ select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5 ---- [[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]] +query ?? +select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6]), + array_remove(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4]); +---- +[[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]] + # list_remove scalar function #3 (function alias `array_remove`) query ??? select list_remove(make_array(1, 2, 2, 1, 1), 2), list_remove(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0), list_remove(make_array('h', 'e', 'l', 'l', 'o'), 'l'); @@ -5088,12 +5251,38 @@ select array_remove_n(make_array(1, 2, 2, 1, 1), 2, 2), array_remove_n(make_arra ---- [1, 1, 1] [2.0, 2.0, 1.0] [h, e, o] +query ??? +select array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int32)'), 2, 2), + array_remove_n(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'LargeList(Float32)'), 1.0, 2), + array_remove_n(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 'l', 3); +---- +[1, 1, 1] [2.0, 2.0, 1.0] [h, e, o] + +query ??? +select array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'FixedSizeList(5, Int32)'), 2, 2), + array_remove_n(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'FixedSizeList(5, Float32)'), 1.0, 2), + array_remove_n(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 'l', 3); +---- +[1, 1, 1] [2.0, 2.0, 1.0] [h, e, o] + # array_remove_n scalar function #2 (element is list) query ?? select array_remove_n(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), [4, 5, 6], 2), array_remove_n(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), [2, 3, 4], 2); ---- [[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]] +query ?? 
+select array_remove_n(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'LargeList(List(Int64))'), [4, 5, 6], 2), + array_remove_n(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'LargeList(List(Int64))'), [2, 3, 4], 2); +---- +[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]] + +query ?? +select array_remove_n(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6], 2), + array_remove_n(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4], 2); +---- +[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]] + # list_remove_n scalar function #3 (function alias `array_remove_n`) query ??? select list_remove_n(make_array(1, 2, 2, 1, 1), 2, 2), list_remove_n(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0, 2), list_remove_n(make_array('h', 'e', 'l', 'l', 'o'), 'l', 3); @@ -5156,6 +5345,13 @@ select array_remove_all(make_array(1, 2, 2, 1, 1), 2), array_remove_all(make_arr ---- [1, 1, 1] [2.0, 2.0] [h, e, o] +query ??? +select array_remove_all(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2), + array_remove_all(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'LargeList(Float64)'), 1.0), + array_remove_all(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 'l'); +---- +[1, 1, 1] [2.0, 2.0] [h, e, o] + query ??? select array_remove_all(arrow_cast(make_array(1, 2, 2, 1, 1), 'FixedSizeList(5, Int64)'), 2), array_remove_all(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'FixedSizeList(5, Float64)'), 1.0), array_remove_all(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 'l'); ---- @@ -5173,6 +5369,12 @@ select array_remove_all(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [ ---- [[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]] +query ?? +select array_remove_all(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6]), + array_remove_all(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4]); +---- +[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]] + # list_remove_all scalar function #3 (function alias `array_remove_all`) query ??? 
select list_remove_all(make_array(1, 2, 2, 1, 1), 2), list_remove_all(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0), list_remove_all(make_array('h', 'e', 'l', 'l', 'o'), 'l'); @@ -6268,14 +6470,12 @@ physical_plan 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] -# FIXME: due to rewrite below not working, this is _extremely_ slow to evaluate -# query I -# with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) -# select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); -# ---- -# 1 +query I +with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) +select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); +---- +1 -# FIXME: array_has with large list haystack not currently rewritten to InList query TT explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); @@ -6286,7 +6486,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32))) +06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: generate_series() projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6295,7 +6495,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8View)), 1, 32)) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6608,6 +6808,17 @@ SELECT array_intersect(arrow_cast(make_array(1,2,3), 'LargeList(Int64)'), arrow ---- [2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] +query ?????? 
+SELECT array_intersect(arrow_cast(make_array(1,2,3), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(2,3,4), 'FixedSizeList(3, Int64)')), + array_intersect(arrow_cast(make_array(1,3,5), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(2,4,6), 'FixedSizeList(3, Int64)')), + array_intersect(arrow_cast(make_array('aa','bb','cc'), 'FixedSizeList(3, Utf8)'), arrow_cast(make_array('cc','aa','dd'), 'FixedSizeList(3, Utf8)')), + array_intersect(arrow_cast(make_array(true, false), 'FixedSizeList(2, Boolean)'), arrow_cast(make_array(true), 'FixedSizeList(1, Boolean)')), + array_intersect(arrow_cast(make_array(1.1, 2.2, 3.3), 'FixedSizeList(3, Float64)'), arrow_cast(make_array(2.2, 3.3, 4.4), 'FixedSizeList(3, Float64)')), + array_intersect(arrow_cast(make_array([1, 1], [2, 2], [3, 3]), 'FixedSizeList(3, List(Int64))'), arrow_cast(make_array([2, 2], [3, 3], [4, 4]), 'FixedSizeList(3, List(Int64))')) +; +---- +[2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] + query ? select array_intersect([], []); ---- @@ -7285,6 +7496,16 @@ select array_except(null, null) ---- NULL +query ? +select array_except(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, 6, 3, 4], 'LargeList(Int64)')); +---- +[1, 2] + +query ? +select array_except(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6, 3, 4], 'FixedSizeList(4, Int64)')); +---- +[1, 2] + ### Array operators tests @@ -7848,6 +8069,11 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 1); ---- [1] +query ? +select array_resize(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 1); +---- +[1] + # array_resize scalar function #2 query ? select array_resize(make_array(1, 2, 3), 5); @@ -7859,6 +8085,11 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 5); ---- [1, 2, 3, NULL, NULL] +query ? +select array_resize(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 5); +---- +[1, 2, 3, NULL, NULL] + # array_resize scalar function #3 query ? select array_resize(make_array(1, 2, 3), 5, 4); @@ -8002,6 +8233,23 @@ NULL NULL [60, 59, 58, 57, 56, 55, 54, NULL, 52, 51] [51, 52, NULL, 54, 55, 56, 57, 58, 59, 60] [70, 69, 68, 67, 66, 65, 64, 63, 62, 61] [61, 62, 63, 64, 65, 66, 67, 68, 69, 70] +statement ok +CREATE TABLE test_reverse_fixed_size AS VALUES + (arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)')), + (arrow_cast([4, 5, 6], 'FixedSizeList(3, Int64)')), + (arrow_cast([NULL, 8, 9], 'FixedSizeList(3, Int64)')), + (NULL); + +query ? +SELECT array_reverse(column1) FROM test_reverse_fixed_size; +---- +[3, 2, 1] +[6, 5, 4] +[9, 8, NULL] +NULL + +statement ok +DROP TABLE test_reverse_fixed_size; # Test defining a table with array columns statement ok diff --git a/datafusion/sqllogictest/test_files/async_udf.slt b/datafusion/sqllogictest/test_files/async_udf.slt new file mode 100644 index 0000000000..c61d02cfec --- /dev/null +++ b/datafusion/sqllogictest/test_files/async_udf.slt @@ -0,0 +1,107 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +statement ok +create table data(x int) as values (-10), (2); + +# Async udf can be used in aggregation +query I +select min(async_abs(x)) from data; +---- +2 + +query TT +explain select min(async_abs(x)) from data; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[min(async_abs(data.x))]] +02)--TableScan: data projection=[x] +physical_plan +01)AggregateExec: mode=Final, gby=[], aggr=[min(async_abs(data.x))] +02)--CoalescePartitionsExec +03)----AggregateExec: mode=Partial, gby=[], aggr=[min(async_abs(data.x))] +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +05)--------AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------DataSourceExec: partitions=1, partition_sizes=[1] + +# Async udf can be used in aggregation with group by +query I rowsort +select min(async_abs(x)) from data group by async_abs(x); +---- +10 +2 + +query TT +explain select min(async_abs(x)) from data group by async_abs(x); +---- +logical_plan +01)Projection: min(async_abs(data.x)) +02)--Aggregate: groupBy=[[__common_expr_1 AS async_abs(data.x)]], aggr=[[min(__common_expr_1 AS async_abs(data.x))]] +03)----Projection: async_abs(data.x) AS __common_expr_1 +04)------TableScan: data projection=[x] +physical_plan +01)ProjectionExec: expr=[min(async_abs(data.x))@1 as min(async_abs(data.x))] +02)--AggregateExec: mode=FinalPartitioned, gby=[async_abs(data.x)@0 as async_abs(data.x)], aggr=[min(async_abs(data.x))] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------RepartitionExec: partitioning=Hash([async_abs(data.x)@0], 4), input_partitions=4 +05)--------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as async_abs(data.x)], aggr=[min(async_abs(data.x))] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------ProjectionExec: expr=[__async_fn_0@1 as __common_expr_1] +08)--------------AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))] +09)----------------CoalesceBatchesExec: target_batch_size=8192 +10)------------------DataSourceExec: partitions=1, partition_sizes=[1] + +# Async udf can be used in filter +query I +select * from data where async_abs(x) < 5; +---- +2 + +query TT +explain select * from data where async_abs(x) < 5; +---- +logical_plan +01)Filter: async_abs(data.x) < Int32(5) +02)--TableScan: data projection=[x] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: __async_fn_0@1 < 5, projection=[x@0] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------DataSourceExec: partitions=1, partition_sizes=[1] + +# Async udf can be used in projection +query I rowsort +select async_abs(x) from data; +---- +10 +2 + +query TT +explain select async_abs(x) from data; +---- +logical_plan +01)Projection: async_abs(data.x) +02)--TableScan: data projection=[x] +physical_plan +01)ProjectionExec: 
expr=[__async_fn_0@1 as async_abs(data.x)] +02)--AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 03cb5edb5f..1e6183f48b 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -297,3 +297,9 @@ CREATE EXTERNAL TABLE staging.foo STORED AS parquet LOCATION '../../parquet-test # Create external table with qualified name, but no schema should error statement error DataFusion error: Error during planning: failed to resolve schema: release CREATE EXTERNAL TABLE release.bar STORED AS parquet LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; + +# Cannot create external table alongside `if_not_exists` and `or_replace` +statement error DataFusion error: SQL error: ParserError\("'IF NOT EXISTS' cannot coexist with 'REPLACE'"\) +CREATE OR REPLACE EXTERNAL TABLE IF NOT EXISTS t_conflict(c1 int) +STORED AS CSV +LOCATION 'foo.csv'; diff --git a/datafusion/sqllogictest/test_files/create_function.slt b/datafusion/sqllogictest/test_files/create_function.slt index 4f0c53c36c..4e82c0866e 100644 --- a/datafusion/sqllogictest/test_files/create_function.slt +++ b/datafusion/sqllogictest/test_files/create_function.slt @@ -21,11 +21,6 @@ ## Note that DataFusion provides a pluggable system for creating functions ## but has no built in support for doing so. -# Use PostgresSQL dialect (until we upgrade to sqlparser 0.44, where CREATE FUNCTION) -# is supported in the Generic dialect (the default) -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - # Create function will fail unless a user supplied function factory is supplied statement error DataFusion error: Invalid or Unsupported Configuration: Function factory has not been configured CREATE FUNCTION foo (DOUBLE) RETURNS DOUBLE RETURN $1 + $2; diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 81f2955eff..f755ab3f35 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -272,7 +272,7 @@ drop table my_table # select_into statement ok -SELECT* INTO my_table FROM (SELECT * FROM aggregate_simple) +SELECT * INTO my_table FROM (SELECT * FROM aggregate_simple) query RRB rowsort SELECT * FROM my_table order by c1 LIMIT 1 @@ -587,7 +587,7 @@ statement ok CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # Should not recreate the same EXTERNAL table -statement error Execution error: Table 'aggregate_simple' already exists +statement error Execution error: External table 'aggregate_simple' already exists CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); statement ok @@ -607,6 +607,55 @@ CREATE TABLE table_without_values(field1 BIGINT, field2 BIGINT); statement error Execution error: 'IF NOT EXISTS' cannot coexist with 'REPLACE' CREATE OR REPLACE TABLE IF NOT EXISTS table_without_values(field1 BIGINT, field2 BIGINT); +# CREATE OR REPLACE +statement ok +CREATE OR REPLACE EXTERNAL TABLE aggregate_simple_repl +STORED AS CSV +LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS 
('format.has_header' 'true'); + +statement ok +CREATE OR REPLACE EXTERNAL TABLE aggregate_simple_repl +STORED AS CSV +LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); + +# Create replacement table for table that doesn't already exist +statement ok +DROP TABLE IF EXISTS aggregate_table; + +statement ok +CREATE OR REPLACE EXTERNAL TABLE aggregate_table +STORED AS CSV +LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); + +query TTT +DESCRIBE aggregate_table; +---- +c1 Float64 YES +c2 Float64 YES +c3 Boolean YES + +# Create replacement table with different format for table that doesn't already exist +query I +COPY (SELECT * FROM (VALUES (1),(2),(3)) AS t(id)) +TO 'test_files/scratch/ddl/test_table' +STORED AS PARQUET; +---- +3 + +statement ok +CREATE OR REPLACE EXTERNAL TABLE aggregate_table +STORED AS PARQUET +LOCATION 'test_files/scratch/ddl/test_table'; + + +query TTT +DESCRIBE aggregate_table; +---- +id Int64 YES + # Should insert into an empty table statement ok insert into table_without_values values (1, 2), (2, 3), (2, 4); diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index 6b80f56bcf..3e60423df8 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -120,7 +120,7 @@ from aggregate_test_100 order by c9 # WindowFunction wrong signature -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'nth_value' function: coercion from \[Int32, Int64, Int64\] to the signature OneOf\(\[Any\(0\), Any\(1\), Any\(2\)\]\) failed +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'nth_value' function: coercion from Int32, Int64, Int64 to the signature OneOf\(\[Any\(0\), Any\(1\), Any\(2\)\]\) failed select c9, nth_value(c5, 2, 3) over (order by c9) as nv1 diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 0df361a75b..7d70a892af 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -392,9 +392,9 @@ physical_plan 44)-----------------------------│ -------------------- ││ -------------------- │ 45)-----------------------------│ files: 1 ││ partition_count(in->out): │ 46)-----------------------------│ format: parquet ││ 1 -> 4 │ -47)-----------------------------│ predicate: true ││ │ -48)-----------------------------│ ││ partitioning_scheme: │ -49)-----------------------------│ ││ RoundRobinBatch(4) │ +47)-----------------------------│ ││ │ +48)-----------------------------│ predicate: ││ partitioning_scheme: │ +49)-----------------------------│ DynamicFilter [ empty ] ││ RoundRobinBatch(4) │ 50)-----------------------------└───────────────────────────┘└─────────────┬─────────────┘ 51)----------------------------------------------------------┌─────────────┴─────────────┐ 52)----------------------------------------------------------│ DataSourceExec │ @@ -1314,7 +1314,7 @@ physical_plan 11)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 12)│ DataSourceExec ││ DataSourceExec │ 13)│ -------------------- ││ -------------------- │ -14)│ bytes: 6040 ││ bytes: 6040 │ +14)│ bytes: 5932 ││ bytes: 5932 │ 15)│ format: memory ││ format: memory │ 16)│ rows: 1 ││ rows: 1 │ 17)└───────────────────────────┘└───────────────────────────┘ @@ -1798,7 +1798,7 @@ physical_plan 
11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ @@ -1821,7 +1821,7 @@ physical_plan 11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ @@ -1844,7 +1844,7 @@ physical_plan 11)┌─────────────┴─────────────┐ 12)│ DataSourceExec │ 13)│ -------------------- │ -14)│ bytes: 2672 │ +14)│ bytes: 2576 │ 15)│ format: memory │ 16)│ rows: 1 │ 17)└───────────────────────────┘ diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index eeea3cd39a..87345b833e 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -2079,9 +2079,6 @@ host1 1.1 101 host2 2.2 202 host3 3.3 303 -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - statement ok create table t (a float) as values (1), (2), (3); @@ -2101,9 +2098,6 @@ physical_plan statement ok drop table t; -statement ok -set datafusion.sql_parser.dialect = 'Generic'; - # test between expression with null query I select 1 where null between null and null; diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 1b5ea3df2c..b72f73d446 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4475,10 +4475,6 @@ physical_plan 12)----------------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4], file_type=csv, has_header=true -# Use PostgreSQL dialect -statement ok -set datafusion.sql_parser.dialect = 'Postgres'; - query II SELECT c2, count(distinct c3) FILTER (WHERE c1 != 'a') FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; ---- @@ -4497,10 +4493,6 @@ SELECT c2, count(distinct c3) FILTER (WHERE c1 != 'a'), count(c5) FILTER (WHERE 4 19 18 5 11 9 -# Restore the default dialect -statement ok -set datafusion.sql_parser.dialect = 'Generic'; - statement ok drop table aggregate_test_100; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 361bc97a17..670992633b 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -242,6 +242,7 @@ datafusion.execution.parquet.dictionary_enabled true datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL +datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 1048576 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 @@ -357,6 +358,7 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. 
datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting +datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. diff --git a/datafusion/sqllogictest/test_files/join.slt.part b/datafusion/sqllogictest/test_files/join.slt.part index c34f168817..2abe654a96 100644 --- a/datafusion/sqllogictest/test_files/join.slt.part +++ b/datafusion/sqllogictest/test_files/join.slt.part @@ -1503,3 +1503,13 @@ drop table t1; statement ok drop table t0; + +# SQLancer fuzzed query (https://github.com/apache/datafusion/issues/14015) +statement ok +create table t1(v1 int, v2 int); + +query error DataFusion error: Schema error: No field named tt1.v2. Valid fields are tt1.v1. +select v1 from t1 as tt1 natural join t1 as tt2 group by v1 order by v2; + +statement ok +drop table t1; diff --git a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt new file mode 100644 index 0000000000..0336cfc2d3 --- /dev/null +++ b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt @@ -0,0 +1,321 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Test IS NOT DISTINCT FROM join functionality +# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins +# to equijoins with proper null equality handling + +statement ok +CREATE TABLE t0 ( + id INT, + val INT +) + +statement ok +CREATE TABLE t1 ( + id INT, + val INT +) + +statement ok +CREATE TABLE t2 ( + id INT, + val INT +) + +statement ok +INSERT INTO t0 VALUES +(1, 10), +(2, NULL), +(5, 50) + +statement ok +INSERT INTO t1 VALUES +(1, 10), +(2, NULL), +(3, 30), +(6, NULL) + +statement ok +INSERT INTO t2 VALUES +(1, 10), +(2, NULL), +(4, 40), +(6, 6) + +# Test basic IS NOT DISTINCT FROM join functionality +query IIII rowsort +SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val +---- +1 1 10 10 +2 2 NULL NULL +6 2 NULL NULL + +# Test that IS NOT DISTINCT FROM join produces HashJoin when used alone +query TT +EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val +---- +logical_plan +01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +02)--Inner Join: t1.val = t2.val +03)----TableScan: t1 projection=[id, val] +04)----TableScan: t2 projection=[id, val] +physical_plan +01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)], NullsEqual: true +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +statement ok +set datafusion.explain.format = "tree"; + +# Tree explain should highlight null equality semantics +query TT +EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val +---- +physical_plan +01)┌───────────────────────────┐ +02)│ ProjectionExec │ +03)│ -------------------- │ +04)│ t1_id: id │ +05)│ t2_id: id │ +06)│ val: val │ +07)└─────────────┬─────────────┘ +08)┌─────────────┴─────────────┐ +09)│ CoalesceBatchesExec │ +10)│ -------------------- │ +11)│ target_batch_size: │ +12)│ 8192 │ +13)└─────────────┬─────────────┘ +14)┌─────────────┴─────────────┐ +15)│ HashJoinExec │ +16)│ -------------------- │ +17)│ NullsEqual: true ├──────────────┐ +18)│ │ │ +19)│ on: (val = val) │ │ +20)└─────────────┬─────────────┘ │ +21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +22)│ DataSourceExec ││ DataSourceExec │ +23)│ -------------------- ││ -------------------- │ +24)│ bytes: 288 ││ bytes: 288 │ +25)│ format: memory ││ format: memory │ +26)│ rows: 1 ││ rows: 1 │ +27)└───────────────────────────┘└───────────────────────────┘ + +statement ok +set datafusion.explain.format = "indent"; + +# For nested expression comparision, it should still able to be converted to Hash Join +query IIII rowsort +SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS NOT DISTINCT FROM 11); +---- +1 1 10 10 + +# The plan should include HashJoin +query TT +EXPLAIN SELECT t1.id AS t1_id, 
t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS NOT DISTINCT FROM 11); +---- +logical_plan +01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + Int64(1) +03)----Filter: CAST(t1.val AS Int64) + Int64(1) IS NOT DISTINCT FROM Int64(11) +04)------TableScan: t1 projection=[id, val] +05)----TableScan: t2 projection=[id, val] +physical_plan +01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4], NullsEqual: true +04)------CoalescePartitionsExec +05)--------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t1.val + Int64(1)] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------CoalesceBatchesExec: target_batch_size=8192 +08)--------------FilterExec: CAST(val@1 AS Int64) + 1 IS NOT DISTINCT FROM 11 +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] +10)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t2.val + Int64(1)] +11)--------DataSourceExec: partitions=1, partition_sizes=[1] + +# Mixed join predicate with `IS DISTINCT FROM` and `IS NOT DISTINCT FROM` +query IIII rowsort +SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS DISTINCT FROM (t2.val % 3)); +---- + +# The plan should include HashJoin +query TT +EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS DISTINCT FROM (t2.val % 3)); +---- +logical_plan +01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + Int64(1) Filter: CAST(t1.val AS Int64) % Int64(3) IS DISTINCT FROM CAST(t2.val AS Int64) % Int64(3) +03)----TableScan: t1 projection=[id, val] +04)----TableScan: t2 projection=[id, val] +physical_plan +01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + Int64(1)@2, t2.val + Int64(1)@2)], filter=CAST(val@0 AS Int64) % 3 IS DISTINCT FROM CAST(val@1 AS Int64) % 3, projection=[id@0, val@1, id@3, val@4], NullsEqual: true +04)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t1.val + Int64(1)] +05)--------DataSourceExec: partitions=1, partition_sizes=[1] +06)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t2.val + Int64(1)] +07)--------DataSourceExec: partitions=1, partition_sizes=[1] + +# Test mixed equal and IS NOT DISTINCT FROM conditions +# The `IS NOT DISTINCT FROM` expr should NOT in HashJoin's `on` predicate +query TT +EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.id = t2.id AND t1.val IS NOT DISTINCT FROM t2.val +---- +logical_plan +01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +02)--Inner Join: t1.id = t2.id Filter: t1.val IS NOT DISTINCT FROM t2.val +03)----TableScan: t1 projection=[id, val] +04)----TableScan: t2 projection=[id, val] +physical_plan +01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, 
val@1 as val, val@3 as val] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], filter=val@0 IS NOT DISTINCT FROM val@1 +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +# Test the mixed condition join result +query IIII rowsort +SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.id = t2.id AND t1.val IS NOT DISTINCT FROM t2.val +---- +1 1 10 10 +2 2 NULL NULL + +# Test 3 table join +query IIII rowsort +SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val +JOIN t0 ON t1.val IS NOT DISTINCT FROM t0.val +---- +1 1 10 10 +2 2 NULL NULL +6 2 NULL NULL + +# Ensure there is HashJoin in the plan +query TT +EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +FROM t1 +JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val +JOIN t0 ON t1.val IS NOT DISTINCT FROM t0.val +---- +logical_plan +01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val +02)--Inner Join: t1.val = t0.val +03)----Inner Join: t1.val = t2.val +04)------TableScan: t1 projection=[id, val] +05)------TableScan: t2 projection=[id, val] +06)----TableScan: t0 projection=[val] +physical_plan +01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@0, val@1)], projection=[id@1, val@2, id@3, val@4], NullsEqual: true +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)--------CoalesceBatchesExec: target_batch_size=8192 +07)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)], NullsEqual: true +08)------------DataSourceExec: partitions=1, partition_sizes=[1] +09)------------DataSourceExec: partitions=1, partition_sizes=[1] + +# Test IS NOT DISTINCT FROM with multiple columns +statement ok +CREATE TABLE t3 ( + id INT, + val1 INT, + val2 INT +) + +statement ok +CREATE TABLE t4 ( + id INT, + val1 INT, + val2 INT +) + +statement ok +INSERT INTO t3 VALUES +(1, 10, 100), +(2, NULL, 200), +(3, 30, NULL) + +statement ok +INSERT INTO t4 VALUES +(1, 10, 100), +(2, NULL, 200), +(3, 30, NULL) + +# Test multiple IS NOT DISTINCT FROM conditions - should produce HashJoin +query TT rowsort +EXPLAIN SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +FROM t3 +JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT FROM t4.val2) +---- +01)Projection: t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +01)ProjectionExec: expr=[id@0 as t3_id, id@3 as t4_id, val1@1 as val1, val1@4 as val1, val2@2 as val2, val2@5 as val2] +02)--CoalesceBatchesExec: target_batch_size=8192 +02)--Inner Join: t3.val1 = t4.val1, t3.val2 = t4.val2 +03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val1@1, val1@1), (val2@2, val2@2)], NullsEqual: true +03)----TableScan: t3 projection=[id, val1, val2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +04)----TableScan: t4 projection=[id, val1, val2] +05)------DataSourceExec: partitions=1, partition_sizes=[1] +logical_plan +physical_plan + +# Test the multiple IS NOT DISTINCT FROM join result +query IIIIII +SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +FROM t3 +JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT 
FROM t4.val2) +---- +1 1 10 10 100 100 +2 2 NULL NULL 200 200 +3 3 30 30 NULL NULL + +statement ok +drop table t0; + +statement ok +drop table t1; + +statement ok +drop table t2; + +statement ok +drop table t3; + +statement ok +drop table t4; diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index d6b7559111..ae82aee5e1 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -853,7 +853,7 @@ physical_plan 01)ProjectionExec: expr=[1 as foo] 02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1 03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilter [ empty ] query I with selection as ( diff --git a/datafusion/sqllogictest/test_files/pipe_operator.slt b/datafusion/sqllogictest/test_files/pipe_operator.slt new file mode 100644 index 0000000000..6b92df9431 --- /dev/null +++ b/datafusion/sqllogictest/test_files/pipe_operator.slt @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# BigQuery supports the pipe operator syntax +# TODO: Make the Generic dialect support the pipe operator syntax +statement ok +set datafusion.sql_parser.dialect = 'BigQuery'; + +statement ok +CREATE TABLE test( + a INT, + b FLOAT, + c VARCHAR, + n VARCHAR +) AS VALUES + (1, 1.1, 'a', NULL), + (2, 2.2, 'b', NULL), + (3, 3.3, 'c', NULL) +; + +# WHERE pipe +query IRTT +SELECT * +FROM test +|> WHERE a > 1 +---- +2 2.2 b NULL +3 3.3 c NULL + +# ORDER BY pipe +query IRTT +SELECT * +FROM test +|> ORDER BY a DESC +---- +3 3.3 c NULL +2 2.2 b NULL +1 1.1 a NULL + +# ORDER BY pipe, limit +query IRTT +SELECT * +FROM test +|> ORDER BY a DESC +|> LIMIT 1 +---- +3 3.3 c NULL + +# SELECT pipe +query I +SELECT * +FROM test +|> SELECT a +---- +1 +2 +3 + +# EXTEND pipe +query IRR +SELECT * +FROM test +|> SELECT a, b +|> EXTEND a + b AS a_plus_b +---- +1 1.1 2.1 +2 2.2 4.2 +3 3.3 6.3 + +query IRR +SELECT * +FROM test +|> SELECT a, b +|> where a = 1 +|> EXTEND a + b AS a_plus_b +---- +1 1.1 2.1 diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index 3a6faf4654..47095d92d9 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -310,7 +310,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(k@0, k@0)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/small_table.parquet]]}, projection=[k], file_type=parquet 04)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/large_table.parquet]]}, projection=[k, v], file_type=parquet, predicate=v@1 >= 50 AND DynamicFilterPhysicalExpr [ true ], pruning_predicate=v_null_count@1 != row_count@2 AND v_max@0 >= 50, required_guarantees=[] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/large_table.parquet]]}, projection=[k, v], file_type=parquet, predicate=v@1 >= 50 AND DynamicFilter [ empty ], pruning_predicate=v_null_count@1 != row_count@2 AND v_max@0 >= 50, required_guarantees=[] statement ok drop table small_table; diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt index e5c69cfbb8..dc6c33caa9 100644 --- a/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt +++ b/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt @@ -23,15 +23,128 @@ ## Original Query: SELECT make_dt_interval(1, 12, 30, 01.001001); ## PySpark 3.5.5 Result: {'make_dt_interval(1, 12, 30, 1.001001)': datetime.timedelta(days=1, seconds=45001, microseconds=1001), 'typeof(make_dt_interval(1, 12, 30, 1.001001))': 'interval day to second', 'typeof(1)': 'int', 'typeof(12)': 'int', 'typeof(30)': 'int', 'typeof(1.001001)': 'decimal(7,6)'} -#query -#SELECT make_dt_interval(1::int, 12::int, 30::int, 1.001001::decimal(7,6)); +query ? 
+SELECT make_dt_interval(1::int, 12::int, 30::int, 1.001001::decimal(7,6)); +---- +1 days 12 hours 30 mins 1.001001 secs ## Original Query: SELECT make_dt_interval(100, null, 3); ## PySpark 3.5.5 Result: {'make_dt_interval(100, NULL, 3, 0.000000)': None, 'typeof(make_dt_interval(100, NULL, 3, 0.000000))': 'interval day to second', 'typeof(100)': 'int', 'typeof(NULL)': 'void', 'typeof(3)': 'int'} -#query -#SELECT make_dt_interval(100::int, NULL::void, 3::int); +query ? +SELECT make_dt_interval(100::int, NULL, 3::int); +---- +NULL ## Original Query: SELECT make_dt_interval(2); ## PySpark 3.5.5 Result: {'make_dt_interval(2, 0, 0, 0.000000)': datetime.timedelta(days=2), 'typeof(make_dt_interval(2, 0, 0, 0.000000))': 'interval day to second', 'typeof(2)': 'int'} -#query -#SELECT make_dt_interval(2::int); +query ? +SELECT make_dt_interval(2::int); +---- +2 days 0 hours 0 mins 0.000000 secs + +# null +query ? +SELECT (make_dt_interval(null, 0, 0, 0)) +---- +NULL + +query ? +SELECT (make_dt_interval(0, null, 0, 0)) +---- +NULL + +query ? +SELECT (make_dt_interval(0, 0, null, 0)) +---- +NULL + +query ? +SELECT (make_dt_interval(0, 0, 0, null)) +---- +NULL + +# missing params +query ? +SELECT (make_dt_interval()) AS make_dt_interval +---- +0 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(1)) AS make_dt_interval +---- +1 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(1, 1)) AS make_dt_interval +---- +1 days 1 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(1, 1, 1)) AS make_dt_interval +---- +1 days 1 hours 1 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(1, 1, 1, 1)) AS make_dt_interval +---- +1 days 1 hours 1 mins 1.000000 secs + + +# all 0 values +query ? +SELECT (make_dt_interval(0, 0, 0, 0)) +---- +0 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(-1, 24, 0, 0)) df +---- +0 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(1, -24, 0, 0)) dt +---- +0 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT (make_dt_interval(0, 0, 0, 0.1)) +---- +0 days 0 hours 0 mins 0.100000 secs + + +# doctest https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.make_dt_interval.html +# extract only the value make_dt_interval + +query ? +SELECT MAKE_DT_INTERVAL(day) AS interval_val +FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec); +---- +1 days 0 hours 0 mins 0.000000 secs + +query ? +SELECT MAKE_DT_INTERVAL(day, hour) AS interval_val +FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec); +---- +1 days 12 hours 0 mins 0.000000 secs + +query ? +SELECT MAKE_DT_INTERVAL(day, hour, min) AS interval_val +FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec); +---- +1 days 12 hours 30 mins 0.000000 secs + +query ? +SELECT MAKE_DT_INTERVAL(day, hour, min, sec) AS interval_val +FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec); +---- +1 days 12 hours 30 mins 1.001001 secs + +query ? +SELECT MAKE_DT_INTERVAL(1, 12, 30, 1.001001) +---- +1 days 12 hours 30 mins 1.001001 secs + +query ? 
+SELECT MAKE_DT_INTERVAL(1, 12, 30, 1.001001); +---- +1 days 12 hours 30 mins 1.001001 secs diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt new file mode 100644 index 0000000000..d6c5199b87 --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file was originally created by a porting script from: +# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function +# This file is part of the implementation of the datafusion-spark function library. +# For more information, please see: +# https://github.com/apache/datafusion/issues/15914 + +query IIIIIIR? +SELECT + y, m, w, d, h, mi, s, + make_interval(y, m, w, d, h, mi, s) AS interval +FROM VALUES + (NULL,2, 3, 4, 5, 6, 7.5), + (1, NULL,3, 4, 5, 6, 7.5), + (1, 2, NULL,4, 5, 6, 7.5), + (1, 2, 3, NULL,5, 6, 7.5), + (1, 2, 3, 4, NULL,6, 7.5), + (1, 2, 3, 4, 5, NULL,7.5), + (1, 2, 3, 4, 5, 6, CAST(NULL AS DOUBLE)), + (1, 1, 1, 1, 1, 1, 1.0) +AS v(y, m, w, d, h, mi, s); +---- +NULL 2 3 4 5 6 7.5 NULL +1 NULL 3 4 5 6 7.5 NULL +1 2 NULL 4 5 6 7.5 NULL +1 2 3 NULL 5 6 7.5 NULL +1 2 3 4 NULL 6 7.5 NULL +1 2 3 4 5 NULL 7.5 NULL +1 2 3 4 5 6 NULL NULL +1 1 1 1 1 1 1 13 mons 8 days 1 hours 1 mins 1.000000000 secs + +query IIIIIIR? +SELECT + y, m, w, d, h, mi, s, + make_interval(y, m, w, d, h, mi, s) AS interval +FROM VALUES + (0, 0, 0, 0, 0, 0, arrow_cast('NaN','Float64')) +AS v(y, m, w, d, h, mi, s); +---- +0 0 0 0 0 0 NaN NULL + +query IIIIIIR? +SELECT + y, m, w, d, h, mi, s, + make_interval(y, m, w, d, h, mi, s) AS interval +FROM VALUES + (0, 0, 0, 0, 0, 0, CAST('Infinity' AS DOUBLE)) +AS v(y, m, w, d, h, mi, s); +---- +0 0 0 0 0 0 Infinity NULL + +query IIIIIIR? +SELECT + y, m, w, d, h, mi, s, + make_interval(y, m, w, d, h, mi, s) AS interval +FROM VALUES + (0, 0, 0, 0, 0, 0, CAST('-Infinity' AS DOUBLE)) +AS v(y, m, w, d, h, mi, s); +---- +0 0 0 0 0 0 -Infinity NULL + +query ? +SELECT make_interval(2147483647, 1, 0, 0, 0, 0, 0.0); +---- +NULL + +query ? +SELECT make_interval(0, 0, 2147483647, 1, 0, 0, 0.0); +---- +NULL + +query ? +SELECT make_interval(0, 0, 0, 0, 2147483647, 1, 0.0); +---- +NULL + +# Intervals being rendered as empty string, see issue: +# https://github.com/apache/datafusion/issues/17455 +# We expect something like 0.00 secs with query ? +query T +SELECT make_interval(0, 0, 0, 0, 0, 0, 0.0) || ''; +---- +(empty) + +# Intervals being rendered as empty string, see issue: +# https://github.com/apache/datafusion/issues/17455 +# We expect something like 0.00 secs with query ? 
+query T +SELECT make_interval() || ''; +---- +(empty) + +query ? +SELECT INTERVAL '1' SECOND AS iv; +---- +1.000000000 secs diff --git a/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt b/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt new file mode 100644 index 0000000000..a26b0435c9 --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Spark doctests +query ? +SELECT map_from_arrays(array(1.0, 3.0), array('2', '4')); +---- +{1.0: 2, 3.0: 4} + +query ? +SELECT map_from_arrays(array(2, 5), array('a', 'b')); +---- +{2: a, 5: b} + +query ? +SELECT map_from_arrays(array(1, 2), array('a', NULL)); +---- +{1: a, 2: NULL} + +query ? +SELECT map_from_arrays(cast(array() as array), cast(array() as array)); +---- +{} + +# Tests with DataType:Null input arrays +query ? +SELECT map_from_arrays(NULL, NULL); +---- +NULL + +query ? +SELECT map_from_arrays(array(1), NULL); +---- +NULL + +query ? +SELECT map_from_arrays(NULL, array(1)); +---- +NULL + +# Tests with different inner lists lengths +query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths +SELECT map_from_arrays(array(1, 2, 3), array('a', 'b')); + +query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths +SELECT map_from_arrays(array(), array('a', 'b')); + +query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths +SELECT map_from_arrays(array(1, 2, 3), array()); + +query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths +select map_from_arrays(a, b) +from values + (array[1], array[1]), + (array[2, 3, 4], array[2, 3]), + (array[5], array[4]) +as tab(a, b); + +#Test with multiple rows: good, empty and nullable +query ? +select map_from_arrays(a, b) +from values + (array[1], array['a']), + (NULL, NULL), + (array[1,2,3], NULL), + (NULL, array['b', 'c']), + (array[4, 5], array['d', 'e']), + (array[], array[]), + (array[6, 7, 8], array['f', 'g', 'h']) +as tab(a, b); +---- +{1: a} +NULL +NULL +NULL +{4: d, 5: e} +{} +{6: f, 7: g, 8: h} + +# Test with complex types +query ? +SELECT map_from_arrays(array(array('a', 'b'), array('c', 'd')), array(struct(1, 2, 3), struct(4, 5, 6))); +---- +{[a, b]: {c0: 1, c1: 2, c2: 3}, [c, d]: {c0: 4, c1: 5, c2: 6}} + +# Test with nested function calls +query ? 
+SELECT + map_from_arrays( + array['outer_key1', 'outer_key2'], + array[ + -- value for outer_key1: a map itself + map_from_arrays( + array['inner_a', 'inner_b'], + array[1, 2] + ), + -- value for outer_key2: another map + map_from_arrays( + array['inner_x', 'inner_y', 'inner_z'], + array[10, 20, 30] + ) + ] + ) AS nested_map; +---- +{outer_key1: {inner_a: 1, inner_b: 2}, outer_key2: {inner_x: 10, inner_y: 20, inner_z: 30}} + +# Test with duplicate keys +query ? +SELECT map_from_arrays(array(true, false, true), array('a', NULL, 'b')); +---- +{false: NULL, true: b} + +# Tests with different list types +query ? +SELECT map_from_arrays(arrow_cast(array(2, 5), 'LargeList(Int32)'), arrow_cast(array('a', 'b'), 'FixedSizeList(2, Utf8)')); +---- +{2: a, 5: b} + +query ? +SELECT map_from_arrays(arrow_cast(array('a', 'b', 'c'), 'FixedSizeList(3, Utf8)'), arrow_cast(array(1, 2, 3), 'LargeList(Int32)')); +---- +{a: 1, b: 2, c: 3} diff --git a/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt b/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt new file mode 100644 index 0000000000..19b46886a0 --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Spark doctests +query ? +SELECT map_from_entries(array[struct(1, 'a'), struct(2, 'b')]); +---- +{1: a, 2: b} + +query ? +SELECT map_from_entries(array[struct(1, cast(null as string)), struct(2, 'b')]); +---- +{1: NULL, 2: b} + +query ? +SELECT map_from_entries(data) +from values + (array[struct(1, 'a'), struct(2, 'b')]), + (array[struct(3, 'c')]) +as tab(data); +---- +{1: a, 2: b} +{3: c} + +# Tests with NULL and empty input structarrays +query ? +SELECT map_from_entries(data) +from values + (cast(array[] as array>)), + (cast(NULL as array>)) +as tab(data); +---- +{} +NULL + +# Test with NULL key, should fail +query error DataFusion error: Arrow error: Invalid argument error: Found unmasked nulls for non-nullable StructArray field "key" +SELECT map_from_entries(array[struct(NULL, 1)]); + +# Tests with NULL and array of Null type, should fail +query error DataFusion error: Execution error: map_from_entries: expected array>, got Null +SELECT map_from_entries(NULL); + +query error DataFusion error: Execution error: map_from_entries: expected array>, got Null +SELECT map_from_entries(array[NULL]); + +# Test with NULL array and NULL entries in arrays +# output is NULL if any entry is NULL +query ? 
+SELECT map_from_entries(data) +from values + ( + array[ + struct(1 as a, 'a' as b), + cast(NULL as struct), + cast(NULL as struct) + ] + ), + (NULL), + ( + array[ + struct(2 as a, 'b' as b), + struct(3 as a, 'c' as b) + ] + ), + ( + array[ + struct(4 as a, 'd' as b), + cast(NULL as struct), + struct(5 as a, 'e' as b), + struct(6 as a, 'f' as b) + ] + ) +as tab(data); +---- +NULL +NULL +{2: b, 3: c} +NULL + +#Test with multiple rows: good, empty and nullable +query ? +SELECT map_from_entries(data) +from values + (NULL), + (array[ + struct(1 as a, 'b' as b), + struct(2 as a, cast(NULL as string) as b), + struct(3 as a, 'd' as b) + ]), + (array[]), + (NULL) +as tab(data); +---- +NULL +{1: b, 2: NULL, 3: d} +{} +NULL + +# Test with complex types +query ? +SELECT map_from_entries(array[ + struct(array('a', 'b'), struct(1, 2, 3)), + struct(array('c', 'd'), struct(4, 5, 6)) +]); +---- +{[a, b]: {c0: 1, c1: 2, c2: 3}, [c, d]: {c0: 4, c1: 5, c2: 6}} + +# Test with nested function calls +query ? +SELECT + map_from_entries( + array[ + struct( + 'outer_key1', + -- value for outer_key1: a map itself + map_from_entries( + array[ + struct('inner_a', 1), + struct('inner_b', 2) + ] + ) + ), + struct( + 'outer_key2', + -- value for outer_key2: another map + map_from_entries( + array[ + struct('inner_x', 10), + struct('inner_y', 20), + struct('inner_z', 30) + ] + ) + ) + ] + ) AS nested_map; +---- +{outer_key1: {inner_a: 1, inner_b: 2}, outer_key2: {inner_x: 10, inner_y: 20, inner_z: 30}} + +# Test with duplicate keys +query ? +SELECT map_from_entries(array( + struct(true, 'a'), + struct(false, 'b'), + struct(true, 'c'), + struct(false, cast(NULL as string)), + struct(true, 'd') +)); +---- +{false: NULL, true: d} diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 79b783f89a..f602dbb54b 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -303,6 +303,26 @@ SELECT regexp_replace(arrow_cast('foobar', 'Dictionary(Int32, Utf8)'), 'bar', 'x ---- fooxx +query T +SELECT regexp_replace(arrow_cast('foobar', 'LargeUtf8'), 'bar', 'xx', 'gi') +---- +fooxx + +query T +SELECT regexp_replace(arrow_cast('foobar', 'Utf8View'), 'bar', 'xx', 'gi') +---- +fooxx + +query T +SELECT regexp_replace('foobar', arrow_cast('bar', 'LargeUtf8'), 'xx', 'gi') +---- +fooxx + +query T +SELECT regexp_replace('foobar', arrow_cast('bar', 'Utf8View'), 'xx', 'gi') +---- +fooxx + query T SELECT repeat('foo', 3) ---- diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index a72c8f5744..7d10a0615d 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -804,7 +804,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k +01)Projection: regexp_replace(test.column1_utf8view, Utf8View("^https?://(?:www\.)?([^/]+)/.*$"), Utf8View("\1")) AS k 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REPEAT diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index f8b7ccc6ae..1a7ff41d64 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -3616,3 +3616,36 @@ SELECT 
arrow_cast(CAST(one AS decimal(17,3)), 'Timestamp(Second, None)') AS a FR ---- 1970-01-01T00:00:01 1970-01-01T00:00:01 + +query TTTTT +SELECT + arrow_typeof(a), + CAST(a AS varchar), + arrow_cast(a, 'Utf8'), + arrow_cast(a, 'Utf8View'), + arrow_cast(a, 'LargeUtf8') +FROM (SELECT DATE '2005-09-10' AS a) +---- +Date32 2005-09-10 2005-09-10 2005-09-10 2005-09-10 + +query TTTTT +SELECT + arrow_typeof(a), + CAST(a AS varchar), + arrow_cast(a, 'Utf8'), + arrow_cast(a, 'Utf8View'), + arrow_cast(a, 'LargeUtf8') +FROM (SELECT TIMESTAMP '2005-09-10 13:31:00' AS a) +---- +Timestamp(Nanosecond, None) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 + +query TTTTT +SELECT + arrow_typeof(a), + CAST(a AS varchar), + arrow_cast(a, 'Utf8'), + arrow_cast(a, 'Utf8View'), + arrow_cast(a, 'LargeUtf8') +FROM (SELECT CAST('2005-09-10 13:31:00 +02:00' AS timestamp with time zone) AS a) +---- +Timestamp(Nanosecond, Some("+00")) 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index ce59b02046..8a08cc17d4 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -316,7 +316,7 @@ explain select number, letter, age from partial_sorted order by number desc, let ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] # Explain variations of the above query with different orderings, and different sort prefixes. 
@@ -326,28 +326,28 @@ explain select number, letter, age from partial_sorted order by age desc limit 3 ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[age@2 DESC], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] query TT explain select number, letter, age from partial_sorted order by number desc, letter desc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] query TT explain select number, letter, age from partial_sorted order by number asc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] query TT explain select number, letter, age from partial_sorted order by letter asc, number desc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[letter@1 ASC NULLS LAST, number@0 DESC], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] # Explicit NULLS ordering cases (reversing the order of the NULLS on the number and letter orderings) query TT @@ -355,14 +355,14 @@ explain select number, letter, age from partial_sorted order by number desc, let ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC], preserve_partitioning=[false], sort_prefix=[number@0 DESC] 
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] query TT explain select number, letter, age from partial_sorted order by number desc NULLS LAST, letter asc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC NULLS LAST, letter@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify that the sort prefix is correctly computed on the normalized ordering (removing redundant aliased columns) @@ -372,7 +372,7 @@ explain select number, letter, age, number as column4, letter as column5 from pa physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[number@0 as number, letter@1 as letter, age@2 as age, number@0 as column4, letter@1 as column5] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify that the sort prefix is correctly computed over normalized, order-maintaining projections (number + 1, number, number + 1, age) query TT diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 996ba0d70a..1f7605d220 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -308,7 +308,7 @@ logical_plan physical_plan 01)UnionExec 02)--CoalesceBatchesExec: target_batch_size=2 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, CAST(t2.id AS Int32)@2), (name@1, name@1)] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, CAST(t2.id AS Int32)@2), (name@1, name@1)], NullsEqual: true 04)------CoalescePartitionsExec 05)--------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[] 06)----------CoalesceBatchesExec: target_batch_size=2 @@ -321,7 +321,7 @@ physical_plan 13)----------DataSourceExec: partitions=1, partition_sizes=[1] 14)--ProjectionExec: 
expr=[CAST(id@0 AS Int32) as id, name@1 as name] 15)----CoalesceBatchesExec: target_batch_size=2 -16)------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1] +16)------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1], NullsEqual: true 17)--------CoalescePartitionsExec 18)----------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)] 19)------------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[] @@ -378,7 +378,7 @@ logical_plan physical_plan 01)UnionExec 02)--CoalesceBatchesExec: target_batch_size=2 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true 04)------CoalescePartitionsExec 05)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] 06)----------CoalesceBatchesExec: target_batch_size=2 @@ -389,7 +389,7 @@ physical_plan 11)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 12)--------DataSourceExec: partitions=1, partition_sizes=[1] 13)--CoalesceBatchesExec: target_batch_size=2 -14)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)] +14)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true 15)------CoalescePartitionsExec 16)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] 17)----------CoalesceBatchesExec: target_batch_size=2 diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index c302582344..e81662a753 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -6034,3 +6034,92 @@ LIMIT 5 0 2 NULL NULL 0 NULL NULL 0 3 NULL NULL 0 NULL NULL 0 4 NULL NULL 0 NULL NULL + +# regression test for https://github.com/apache/datafusion/issues/17401 +query I +WITH source AS ( + SELECT + 1 AS n, + '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8, + '' AS a9, '' AS a10, '' AS a11, '' AS a12 +) +SELECT + sum(n) OVER (PARTITION BY + a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12 + ) +FROM source; +---- +1 + +# regression test for https://github.com/apache/datafusion/issues/17401 +query I +WITH source AS ( + SELECT + 1 AS n, + '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8, + '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16, + '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS a22, '' AS a23, '' AS a24, + '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32, + '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40 +) +SELECT + sum(n) OVER (PARTITION BY + a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, + a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40 + ) +FROM source; +---- +1 + +# regression test for https://github.com/apache/datafusion/issues/17401 +query I +WITH source AS ( + SELECT + 1 AS n, + '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8, + '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16, + '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS 
a22, '' AS a23, '' AS a24, + '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32, + '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40 +) +SELECT + sum(n) OVER (PARTITION BY + a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, + a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40 + ) +FROM ( + SELECT * FROM source + ORDER BY a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, + a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40 +); +---- +1 + +# regression test for https://github.com/apache/datafusion/issues/17401 +query I +WITH source AS ( + SELECT + 1 AS n, + '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8, + '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16, + '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS a22, '' AS a23, '' AS a24, + '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32, + '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40 +) +SELECT + sum(n) OVER (PARTITION BY + a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, + a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40 + ) +FROM ( + SELECT * FROM source + WHERE a1 = '' AND a2 = '' AND a3 = '' AND a4 = '' AND a5 = '' AND a6 = '' AND a7 = '' AND a8 = '' + AND a9 = '' AND a10 = '' AND a11 = '' AND a12 = '' AND a13 = '' AND a14 = '' AND a15 = '' AND a16 = '' + AND a17 = '' AND a18 = '' AND a19 = '' AND a20 = '' AND a21 = '' AND a22 = '' AND a23 = '' AND a24 = '' + AND a25 = '' AND a26 = '' AND a27 = '' AND a28 = '' AND a29 = '' AND a30 = '' AND a31 = '' AND a32 = '' + AND a33 = '' AND a34 = '' AND a35 = '' AND a36 = '' AND a37 = '' AND a38 = '' AND a39 = '' AND a40 = '' + ORDER BY a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, + a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40 +); +---- +1 \ No newline at end of file diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md index 8e7f99b7df..d18d7bda5e 100644 --- a/datafusion/substrait/README.md +++ b/datafusion/substrait/README.md @@ -19,9 +19,12 @@ # Apache DataFusion Substrait -This crate contains a [Substrait] producer and consumer for [Apache DataFusion] +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. + +This crate is a submodule of DataFusion that provides a [Substrait] producer and consumer for DataFusion plans. See [API Docs] for details and examples. 
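
As context for the Substrait changes that follow in this patch, the basic producer/consumer round trip looks roughly like the sketch below. This is an editorial illustration, not part of the README change itself: it assumes the `to_substrait_plan` / `from_substrait_plan` entry points in `datafusion_substrait::logical_plan::{producer, consumer}` accept a `SessionState` (older releases took a `SessionContext`), and it assumes the test CSV registered elsewhere in this patch exposes columns `a` and `b`.

```rust
use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};
use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
use datafusion_substrait::logical_plan::producer::to_substrait_plan;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    ctx.register_csv("data", "tests/testdata/data.csv", CsvReadOptions::new())
        .await?;

    // Build an ordinary DataFusion logical plan from SQL.
    let plan = ctx
        .sql("SELECT a, b FROM data WHERE a > 1")
        .await?
        .into_optimized_plan()?;

    // Produce a Substrait protobuf plan from the DataFusion plan ...
    let substrait_plan = to_substrait_plan(&plan, &ctx.state())?;

    // ... and consume it back into a DataFusion logical plan.
    let roundtrip = from_substrait_plan(&ctx.state(), &substrait_plan).await?;
    println!("{}", roundtrip.display_indent());

    Ok(())
}
```

The new snapshot tests later in this patch (for example the `roundtrip_right_anti_join` and `roundtrip_right_semi_join` cases) exercise this same produce-then-consume path through an internal `generate_plan_from_sql` helper.
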
+[apache arrow]: https://arrow.apache.org/ +[apache datafusion]: https://datafusion.apache.org/ [substrait]: https://substrait.io -[apache datafusion]: https://datafusion.apache.org [api docs]: https://docs.rs/datafusion-substrait/latest diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs b/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs index d054e52675..dc7a5935c0 100644 --- a/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs +++ b/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs @@ -25,7 +25,8 @@ use crate::variation_const::{ INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF, TIMESTAMP_MILLI_TYPE_VARIATION_REF, TIMESTAMP_NANO_TYPE_VARIATION_REF, - TIMESTAMP_SECOND_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF, + TIMESTAMP_SECOND_TYPE_VARIATION_REF, TIME_32_TYPE_VARIATION_REF, + TIME_64_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF, }; use datafusion::arrow::array::{new_empty_array, AsArray, MapArray}; @@ -155,6 +156,45 @@ pub(crate) fn from_substrait_literal( } }, Some(LiteralType::Date(d)) => ScalarValue::Date32(Some(*d)), + Some(LiteralType::PrecisionTime(pt)) => match pt.precision { + 0 => match lit.type_variation_reference { + TIME_32_TYPE_VARIATION_REF => { + ScalarValue::Time32Second(Some(pt.value as i32)) + } + others => { + return substrait_err!("Unknown type variation reference {others}"); + } + }, + 3 => match lit.type_variation_reference { + TIME_32_TYPE_VARIATION_REF => { + ScalarValue::Time32Millisecond(Some(pt.value as i32)) + } + others => { + return substrait_err!("Unknown type variation reference {others}"); + } + }, + 6 => match lit.type_variation_reference { + TIME_64_TYPE_VARIATION_REF => { + ScalarValue::Time64Microsecond(Some(pt.value)) + } + others => { + return substrait_err!("Unknown type variation reference {others}"); + } + }, + 9 => match lit.type_variation_reference { + TIME_64_TYPE_VARIATION_REF => { + ScalarValue::Time64Nanosecond(Some(pt.value)) + } + others => { + return substrait_err!("Unknown type variation reference {others}"); + } + }, + p => { + return not_impl_err!( + "Unsupported Substrait precision {p} for PrecisionTime" + ); + } + }, Some(LiteralType::String(s)) => match lit.type_variation_reference { DEFAULT_CONTAINER_TYPE_VARIATION_REF => ScalarValue::Utf8(Some(s.clone())), LARGE_CONTAINER_TYPE_VARIATION_REF => ScalarValue::LargeUtf8(Some(s.clone())), diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs b/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs index d701827671..7358f1422f 100644 --- a/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs +++ b/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs @@ -93,7 +93,7 @@ pub async fn from_substrait_rex( consumer.consume_dynamic_parameter(expr, input_schema).await } }, - None => substrait_err!("Expression must set rex_type: {:?}", expression), + None => substrait_err!("Expression must set rex_type: {expression:?}"), } } diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs b/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs index f7e4c2bb0f..917bcc0077 100644 --- a/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs +++ b/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs @@ -90,13 +90,12 @@ pub async fn from_subquery( ))) } other_type => substrait_err!( - "unimplemented type {:?} for set 
predicate", - other_type + "unimplemented type {other_type:?} for set predicate" ), } } other_type => { - substrait_err!("Subquery type {:?} not implemented", other_type) + substrait_err!("Subquery type {other_type:?} not implemented") } }, None => { diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs index ade8a4e77e..5681c92326 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs @@ -145,9 +145,11 @@ fn from_substrait_jointype(join_type: i32) -> datafusion::common::Result Ok(JoinType::LeftSemi), join_rel::JoinType::LeftMark => Ok(JoinType::LeftMark), join_rel::JoinType::RightMark => Ok(JoinType::RightMark), + join_rel::JoinType::RightAnti => Ok(JoinType::RightAnti), + join_rel::JoinType::RightSemi => Ok(JoinType::RightSemi), _ => plan_err!("unsupported join type {substrait_join_type:?}"), } } else { - plan_err!("invalid join type variant {join_type:?}") + plan_err!("invalid join type variant {join_type}") } } diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs index 3ea318b214..48e93c04bb 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs @@ -221,7 +221,7 @@ pub async fn from_read_rel( .await } _ => { - not_impl_err!("Unsupported ReadType: {:?}", read.read_type) + not_impl_err!("Unsupported Readtype: {:?}", read.read_type) } } } diff --git a/datafusion/substrait/src/logical_plan/producer/expr/literal.rs b/datafusion/substrait/src/logical_plan/producer/expr/literal.rs index 31f4866bdc..2c66e9f6b0 100644 --- a/datafusion/substrait/src/logical_plan/producer/expr/literal.rs +++ b/datafusion/substrait/src/logical_plan/producer/expr/literal.rs @@ -19,7 +19,8 @@ use crate::logical_plan::producer::{to_substrait_type, SubstraitProducer}; use crate::variation_const::{ DATE_32_TYPE_VARIATION_REF, DECIMAL_128_TYPE_VARIATION_REF, DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF, - LARGE_CONTAINER_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF, + LARGE_CONTAINER_TYPE_VARIATION_REF, TIME_32_TYPE_VARIATION_REF, + TIME_64_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF, }; use datafusion::arrow::array::{Array, GenericListArray, OffsetSizeTrait}; @@ -29,7 +30,7 @@ use substrait::proto::expression::literal::interval_day_to_second::PrecisionMode use substrait::proto::expression::literal::map::KeyValue; use substrait::proto::expression::literal::{ Decimal, IntervalCompound, IntervalDayToSecond, IntervalYearToMonth, List, - LiteralType, Map, PrecisionTimestamp, Struct, + LiteralType, Map, PrecisionTime, PrecisionTimestamp, Struct, }; use substrait::proto::expression::{Literal, RexType}; use substrait::proto::{r#type, Expression}; @@ -280,6 +281,34 @@ pub(crate) fn to_substrait_literal( }; (map, DEFAULT_CONTAINER_TYPE_VARIATION_REF) } + ScalarValue::Time32Second(Some(t)) => ( + LiteralType::PrecisionTime(PrecisionTime { + precision: 0, + value: *t as i64, + }), + TIME_32_TYPE_VARIATION_REF, + ), + ScalarValue::Time32Millisecond(Some(t)) => ( + LiteralType::PrecisionTime(PrecisionTime { + precision: 3, + value: *t as i64, + }), + TIME_32_TYPE_VARIATION_REF, + ), + ScalarValue::Time64Microsecond(Some(t)) => ( + LiteralType::PrecisionTime(PrecisionTime { + precision: 6, + value: *t, + }), 
+ TIME_64_TYPE_VARIATION_REF, + ), + ScalarValue::Time64Nanosecond(Some(t)) => ( + LiteralType::PrecisionTime(PrecisionTime { + precision: 9, + value: *t, + }), + TIME_64_TYPE_VARIATION_REF, + ), ScalarValue::Struct(s) => ( LiteralType::Struct(Struct { fields: s @@ -398,6 +427,18 @@ mod tests { round_trip_literal(ScalarValue::TimestampNanosecond(ts, tz))?; } + // Test Time32 literals + round_trip_literal(ScalarValue::Time32Second(Some(45296)))?; + round_trip_literal(ScalarValue::Time32Second(None))?; + round_trip_literal(ScalarValue::Time32Millisecond(Some(45296789)))?; + round_trip_literal(ScalarValue::Time32Millisecond(None))?; + + // Test Time64 literals + round_trip_literal(ScalarValue::Time64Microsecond(Some(45296789123)))?; + round_trip_literal(ScalarValue::Time64Microsecond(None))?; + round_trip_literal(ScalarValue::Time64Nanosecond(Some(45296789123000)))?; + round_trip_literal(ScalarValue::Time64Nanosecond(None))?; + round_trip_literal(ScalarValue::List(ScalarValue::new_list_nullable( &[ScalarValue::Float32(Some(1.0))], &DataType::Float32, diff --git a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs index 4abd283a7e..917959ea7d 100644 --- a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs +++ b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs @@ -174,7 +174,7 @@ pub fn to_substrait_agg_measure( to_substrait_agg_measure(producer, expr, schema) } _ => internal_err!( - "Expression must be compatible with aggregation. Unsupported expression: {:?}. ExpressionType: {:?}", + "Expression must be compatible with aggregation. Unsupported expression: {:?}. Expressiontype: {}", expr, expr.variant_name() ), diff --git a/datafusion/substrait/src/logical_plan/producer/rel/join.rs b/datafusion/substrait/src/logical_plan/producer/rel/join.rs index 3dbac636fe..835d3ee37a 100644 --- a/datafusion/substrait/src/logical_plan/producer/rel/join.rs +++ b/datafusion/substrait/src/logical_plan/producer/rel/join.rs @@ -115,8 +115,7 @@ fn to_substrait_jointype(join_type: JoinType) -> join_rel::JoinType { JoinType::LeftSemi => join_rel::JoinType::LeftSemi, JoinType::LeftMark => join_rel::JoinType::LeftMark, JoinType::RightMark => join_rel::JoinType::RightMark, - JoinType::RightAnti | JoinType::RightSemi => { - unimplemented!() - } + JoinType::RightAnti => join_rel::JoinType::RightAnti, + JoinType::RightSemi => join_rel::JoinType::RightSemi, } } diff --git a/datafusion/substrait/src/logical_plan/producer/types.rs b/datafusion/substrait/src/logical_plan/producer/types.rs index d819c2042c..3da9269c5b 100644 --- a/datafusion/substrait/src/logical_plan/producer/types.rs +++ b/datafusion/substrait/src/logical_plan/producer/types.rs @@ -325,7 +325,7 @@ pub(crate) fn to_substrait_type( precision: *p as i32, })), }), - _ => not_impl_err!("Unsupported cast type: {dt:?}"), + _ => not_impl_err!("Unsupported cast type: {dt}"), } } @@ -446,7 +446,7 @@ mod tests { } fn round_trip_type(dt: DataType) -> Result<()> { - println!("Checking round trip of {dt:?}"); + println!("Checking round trip of {dt}"); // As DataFusion doesn't consider nullability as a property of the type, but field, // it doesn't matter if we set nullability to true or false here. 
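
To make the new literal mapping easier to follow: both the consumer hunk earlier in this patch and the producer code above use the Substrait `PrecisionTime` convention in which `precision` is the number of fractional decimal digits of a second (0 = seconds, 3 = milliseconds, 6 = microseconds, 9 = nanoseconds) and `value` is the time of day expressed in those units, while the `TIME_32` / `TIME_64` type variation references record whether the Arrow-side value is 32-bit (`Time32Second`, `Time32Millisecond`) or 64-bit (`Time64Microsecond`, `Time64Nanosecond`). A minimal sketch of that convention (a hypothetical helper, not code from this patch):

```rust
use datafusion::arrow::datatypes::TimeUnit;

/// Substrait `PrecisionTime.precision` is the number of fractional
/// decimal digits of a second for the given Arrow time unit.
fn substrait_time_precision(unit: &TimeUnit) -> i32 {
    match unit {
        TimeUnit::Second => 0,      // Arrow Time32
        TimeUnit::Millisecond => 3, // Arrow Time32
        TimeUnit::Microsecond => 6, // Arrow Time64
        TimeUnit::Nanosecond => 9,  // Arrow Time64
    }
}

fn main() {
    // 12:34:56 after midnight, matching the values used in the round-trip
    // tests above: 45296 s, 45296789 ms, 45296789123 us, 45296789123000 ns.
    let seconds: i64 = 12 * 3600 + 34 * 60 + 56;
    assert_eq!(seconds, 45_296);
    assert_eq!(substrait_time_precision(&TimeUnit::Second), 0);
    assert_eq!(substrait_time_precision(&TimeUnit::Nanosecond), 9);
    println!("precision convention checked");
}
```

On the consumer side this is also why each `precision` arm checks the type variation reference: the precision alone determines the unit, but the variation is what distinguishes a `Time32` from a `Time64` `ScalarValue`.
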
diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 4990054ac7..ecf465dd3f 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -166,7 +166,7 @@ pub async fn from_substrait_rel( ), } } - _ => not_impl_err!("Unsupported RelType: {:?}", rel.rel_type), + _ => not_impl_err!("Unsupported Reltype: {:?}", rel.rel_type), } } diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 6ea0de9379..a92fc2957c 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -605,26 +605,30 @@ mod tests { #[tokio::test] async fn test_multiple_joins() -> Result<()> { let plan_str = test_plan_to_string("multiple_joins.json").await?; - assert_eq!( + assert_snapshot!( plan_str, - "Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Left Join: left.id = right.id\ - \n SubqueryAlias: left\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id, category]], aggr=[[]]\ - \n Values: (Int64(1), Utf8(\"info\")), (Int64(2), Utf8(\"low\"))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))\ - \n SubqueryAlias: right\ - \n Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\ - \n Values: (Int64(1)), (Int64(2))" + @r#" + Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third + Left Join: left.id = right.id + SubqueryAlias: left + Projection: left.id, left.count(Int64(1)), left.id:1, left.category, right.id AS id:2, right.count(Int64(1)) AS count(Int64(1)):1 + Left Join: left.id = right.id + SubqueryAlias: left + Projection: left.id, left.count(Int64(1)), right.id AS id:1, right.category + Left Join: left.id = right.id + SubqueryAlias: left + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + SubqueryAlias: right + Aggregate: groupBy=[[id, category]], aggr=[[]] + Values: (Int64(1), Utf8("info")), (Int64(2), Utf8("low")) + SubqueryAlias: right + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + SubqueryAlias: right + Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]] + Values: (Int64(1)), (Int64(2)) + "# ); Ok(()) } diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 616dc917ef..39e4984ab9 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -626,6 +626,66 @@ async fn roundtrip_exists_filter() -> Result<()> { Ok(()) } +#[tokio::test] +async fn roundtrip_not_exists_filter_left_anti_join() -> Result<()> { + let plan = generate_plan_from_sql( + "SELECT ba.isbn, ba.author FROM book_author ba WHERE NOT EXISTS (SELECT 1 FROM book b WHERE b.isbn = ba.isbn)", + false, + true, + ) + .await?; + + assert_snapshot!( + plan, + @r#" + LeftAnti Join: book_author.isbn = book.isbn + TableScan: book_author projection=[isbn, author] + TableScan: 
book projection=[isbn] + "# + ); + Ok(()) +} + +#[tokio::test] +async fn roundtrip_right_anti_join() -> Result<()> { + let plan = generate_plan_from_sql( + "SELECT * FROM book b RIGHT ANTI JOIN book_author ba ON b.isbn = ba.isbn", + false, + true, + ) + .await?; + + assert_snapshot!( + plan, + @r#" + RightAnti Join: book.isbn = book_author.isbn + TableScan: book projection=[isbn] + TableScan: book_author projection=[isbn, author] + "# + ); + Ok(()) +} + +#[tokio::test] +async fn roundtrip_right_semi_join() -> Result<()> { + let plan = generate_plan_from_sql( + "SELECT * FROM book b RIGHT SEMI JOIN book_author ba ON b.isbn = ba.isbn", + false, + true, + ) + .await?; + + assert_snapshot!( + plan, + @r#" + RightSemi Join: book.isbn = book_author.isbn + TableScan: book projection=[isbn] + TableScan: book_author projection=[isbn, author] + "# + ); + Ok(()) +} + #[tokio::test] async fn inner_join() -> Result<()> { let plan = generate_plan_from_sql( @@ -1475,7 +1535,7 @@ fn check_post_join_filters(rel: &Rel) -> Result<()> { } Some(RelType::ExtensionLeaf(_)) | Some(RelType::Read(_)) => Ok(()), _ => not_impl_err!( - "Unsupported RelType: {:?} in post join filter check", + "Unsupported Reltype: {:?} in post join filter check", rel.rel_type ), } @@ -1751,6 +1811,34 @@ async fn create_context() -> Result { ctx.register_csv("data2", "tests/testdata/data.csv", CsvReadOptions::new()) .await?; + // Register test tables for anti join tests + let book_fields = vec![ + Field::new("isbn", DataType::Int64, false), + Field::new("title", DataType::Utf8, true), + Field::new("genre", DataType::Utf8, true), + ]; + let book_schema = Schema::new(book_fields); + let mut book_options = CsvReadOptions::new(); + book_options.schema = Some(&book_schema); + book_options.has_header = false; + ctx.register_csv("book", "tests/testdata/empty.csv", book_options) + .await?; + + let book_author_fields = vec![ + Field::new("isbn", DataType::Int64, true), + Field::new("author", DataType::Utf8, true), + ]; + let book_author_schema = Schema::new(book_author_fields); + let mut book_author_options = CsvReadOptions::new(); + book_author_options.schema = Some(&book_author_schema); + book_author_options.has_header = false; + ctx.register_csv( + "book_author", + "tests/testdata/empty.csv", + book_author_options, + ) + .await?; + Ok(ctx) } diff --git a/datafusion/substrait/tests/cases/substrait_validations.rs b/datafusion/substrait/tests/cases/substrait_validations.rs index a31b3ca385..c8cc3fe994 100644 --- a/datafusion/substrait/tests/cases/substrait_validations.rs +++ b/datafusion/substrait/tests/cases/substrait_validations.rs @@ -51,7 +51,7 @@ mod tests { let ctx = SessionContext::new(); ctx.register_table( table_ref, - Arc::new(EmptyTable::new(df_schema.inner().clone())), + Arc::new(EmptyTable::new(Arc::clone(df_schema.inner()))), )?; Ok(ctx) } diff --git a/datafusion/substrait/tests/utils.rs b/datafusion/substrait/tests/utils.rs index e3e3ec3fab..f84594312b 100644 --- a/datafusion/substrait/tests/utils.rs +++ b/datafusion/substrait/tests/utils.rs @@ -150,7 +150,7 @@ pub mod test { let df_schema = from_substrait_named_struct(self.consumer, substrait_schema)? 
.replace_qualifier(table_reference.clone()); - let table = EmptyTable::new(df_schema.inner().clone()); + let table = EmptyTable::new(Arc::clone(df_schema.inner())); self.schemas.push((table_reference, Arc::new(table))); Ok(()) } diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 293188b318..dca98a7e38 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -60,4 +60,4 @@ object_store = { workspace = true } # needs to be compiled tokio = { workspace = true } url = { workspace = true } -wasm-bindgen-test = "0.3.51" +wasm-bindgen-test = "0.3.54" diff --git a/datafusion/wasmtest/README.md b/datafusion/wasmtest/README.md index 70f4daef91..57a12ef8b8 100644 --- a/datafusion/wasmtest/README.md +++ b/datafusion/wasmtest/README.md @@ -32,7 +32,7 @@ Some of DataFusion's downstream projects compile to WASM to run in the browser. ## Setup -First, [install wasm-pack](https://rustwasm.github.io/wasm-pack/installer/) +First, [install wasm-pack](https://drager.github.io/wasm-pack/installer/) Then use wasm-pack to compile the crate from within this directory @@ -40,6 +40,20 @@ Then use wasm-pack to compile the crate from within this directory wasm-pack build ``` +### Apple silicon + +The default installation of Clang on Apple silicon does not support wasm, so you'll need to install LLVM Clang. For example via Homebrew: + +```sh +brew install llvm +# You will also need to install wasm-bindgen-cli separately, changing version as needed (0.3.53 = 0.2.103) +cargo install wasm-bindgen-cli@0.2.103 +# Need to run commands like so, unless you edit your PATH to prepend the LLVM version of Clang +PATH="/opt/homebrew/opt/llvm/bin:$PATH" RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build +``` + +- For reference: https://github.com/briansmith/ring/issues/1824 + ## Try it out The `datafusion-wasm-app` directory contains a simple app (created with [`create-wasm-app`](https://github.com/rustwasm/create-wasm-app) and then manually updated to WebPack 5) that invokes DataFusion and writes results to the browser console. diff --git a/dev/release/README.md b/dev/release/README.md index 5b51295efd..d70e256f73 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -311,47 +311,8 @@ Verify that the Cargo.toml in the tarball contains the correct version ### Publish datafusion-cli on Homebrew -Run `publish_homebrew.sh` to publish `datafusion-cli` on Homebrew. In order to do so it is necessary to -fork the `homebrew-core` repo https://github.com/Homebrew/homebrew-core/, have Homebrew installed on your -macOS/Linux/WSL2 and properly configured and have a Github Personal Access Token that has permission to file pull requests in the `homebrew-core` repo. - -#### Fork the `homebrew-core` repo - -Go to https://github.com/Homebrew/homebrew-core/ and fork the repo. - -#### Install and configure Homebrew - -Please visit https://brew.sh/ to obtain Homebrew. In addition to that please check out https://docs.brew.sh/Homebrew-on-Linux if you are on Linux or WSL2. - -Before running the script make sure that you can run the following command in your bash to make sure -that `brew` has been installed and configured properly: - -```shell -brew --version -``` - -#### Create a Github Personal Access Token - -To create a Github Personal Access Token, please visit https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token for instructions. 
- -- Make sure to select either **All repositories** or **Only selected repositories** so that you have access to **Repository permissions**. -- If you only use the token for selected repos make sure you include your - fork of `homebrew-core` in the list of repos under **Selected repositories**. -- Make sure to have **Read and write** access enabled for pull requests in your **Repository permissions**. - -After all of the above is complete execute the following command: - -```shell -dev/release/publish_homebrew.sh -``` - -Note that sometimes someone else has already submitted a PR to update the datafusion formula in homebrew. -In this case you will get an error with a message that your PR is a duplicate of an existing one. In this -case no further action is required. - -Alternatively manually submit a simple PR to update tag and commit hash for the datafusion -formula in homebrew-core. Here is an example PR: -https://github.com/Homebrew/homebrew-core/pull/89562. +[`datafusion` formula](https://formulae.brew.sh/formula/datafusion) is [updated automatically](https://github.com/Homebrew/homebrew-core/pulls?q=is%3Apr+datafusion+is%3Aclosed), +so no action is needed. ### Call the vote diff --git a/dev/release/publish_homebrew.sh b/dev/release/publish_homebrew.sh deleted file mode 100644 index 20955953e8..0000000000 --- a/dev/release/publish_homebrew.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ue - -if [ "$#" -ne 4 ]; then - echo "Usage: $0 " - exit 1 -fi - -version=$1 -github_user=$2 -github_token=$3 -# Prepare for possible renaming of the default branch on Homebrew -homebrew_default_branch_name=$4 - -# Git parallel fetch -if sysctl -n hw.ncpu 2>/dev/null; then # macOS - num_processing_units=$(sysctl -n hw.ncpu) -elif [ -x "$(command -v nproc)" ]; then # Linux - num_processing_units=$(nproc) -else # Fallback - num_processing_units=1 -fi - -url="https://www.apache.org/dyn/closer.lua?path=datafusion/datafusion-${version}/apache-datafusion-${version}.tar.gz" -sha256="$(curl https://dist.apache.org/repos/dist/release/datafusion/datafusion-${version}/apache-datafusion-${version}.tar.gz.sha256 | cut -d' ' -f1)" - -pushd "$(brew --repository homebrew/core)" - -if ! 
git remote | grep -q --fixed-strings ${github_user}; then - echo "Setting ''${github_user}' remote" - git remote add ${github_user} git@github.com:${github_user}/homebrew-core.git -fi - -echo "Updating working copy" -git fetch --all --prune --tags --force -j$num_processing_units - -branch=apache-datafusion-${version} -echo "Creating branch: ${branch}" -git branch -D ${branch} || : -git checkout -b ${branch} origin/master - -echo "Updating datafusion formulae" -brew bump-formula-pr \ - --commit \ - --no-audit \ - --sha256="${sha256}" \ - --url="${url}" \ - --verbose \ - --write-only \ - datafusion - -echo "Testing datafusion formulae" -brew uninstall datafusion || : -brew install --build-from-source datafusion -brew test datafusion -brew audit --strict datafusion - -git push -u $github_user ${branch} - -git checkout - - -popd - -echo "Create the pull request" -title="datafusion ${version}" -body="Created using \`bump-formula-pr\`" -data="{\"title\":\"$title\", \"body\":\"$body\", \"head\":\"$github_username:$branch\", \"base\":\"$homebrew_default_branch_name\"}" -curl -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $github_token" \ - https://api.github.com/repos/Homebrew/homebrew-core/pulls \ - -d "$data" - -echo "Complete!" diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh index af0fce72cc..8fe7220085 100755 --- a/dev/rust_lint.sh +++ b/dev/rust_lint.sh @@ -20,13 +20,21 @@ # This script runs all the Rust lints locally the same way the # DataFusion CI does +# For `.toml` format checking set -e if ! command -v taplo &> /dev/null; then echo "Installing taplo using cargo" cargo install taplo-cli fi +# For Apache licence header checking +if ! command -v hawkeye &> /dev/null; then + echo "Installing hawkeye using cargo" + cargo install hawkeye --locked +fi + ci/scripts/rust_fmt.sh ci/scripts/rust_clippy.sh ci/scripts/rust_toml_fmt.sh ci/scripts/rust_docs.sh +ci/scripts/license_header.sh \ No newline at end of file diff --git a/docs/scripts/update_committer_list.py b/docs/scripts/update_committer_list.py new file mode 100755 index 0000000000..c66eb52468 --- /dev/null +++ b/docs/scripts/update_committer_list.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +""" +Utility for updating the committer list in the governance documentation +by reading from the Apache DataFusion phonebook and combining with existing data. 
+""" + +import re +import requests +import sys +import os +from typing import Dict, List, NamedTuple, Set + + +class Committer(NamedTuple): + name: str + apache: str + github: str + affiliation: str + role: str + + +# Return (pmc, committers) each a dictionary like +# key: apache id +# value: Real name + +def get_asf_roster(): + """Get the current roster from Apache phonebook.""" + # See https://home.apache.org/phonebook-about.html + committers_url = "https://whimsy.apache.org/public/public_ldap_projects.json" + + # people https://whimsy.apache.org/public/public_ldap_people.json + people_url = "https://whimsy.apache.org/public/public_ldap_people.json" + + try: + r = requests.get(committers_url) + r.raise_for_status() + j = r.json() + proj = j['projects']['datafusion'] + + # Get PMC members and committers + pmc_ids = set(proj['owners']) + committer_ids = set(proj['members']) - pmc_ids + + except Exception as e: + print(f"Error fetching ASF roster: {e}") + return set(), set() + + # Fetch people to get github handles and affiliations + # + # The data looks like this: + # { + # "lastCreateTimestamp": "20250913131506Z", + # "people_count": 9932, + # "people": { + # "a_budroni": { + # "name": "Alessandro Budroni", + # "createTimestamp": "20160720223917Z" + # }, + # ... + # } + try: + r = requests.get(people_url) + r.raise_for_status() + j = r.json() + people = j['people'] + + # make a dictionary with each pmc_id and value their real name + pmcs = {p: people[p]['name'] for p in pmc_ids} + committers = {c: people[c]['name'] for c in committer_ids} + + except Exception as e: + print(f"Error fetching ASF people: {e}") + + + return pmcs, committers + + + +def parse_existing_table(content: str) -> List[Committer]: + """Parse the existing committer table from the markdown content.""" + committers = [] + + # Find the table between the markers + start_marker = "" + end_marker = "" + + start_idx = content.find(start_marker) + end_idx = content.find(end_marker) + + if start_idx == -1 or end_idx == -1: + return committers + + table_content = content[start_idx:end_idx] + + # Parse table rows (skip header and separator) + lines = table_content.split('\n') + for line in lines: + line = line.strip() + if line.startswith('|') and '---' not in line and line.count('|') >= 4: + # Split by | and clean up + parts = [part.strip() for part in line.split('|')] + if len(parts) >= 5: + name = parts[1].strip() + apache = parts[2].strip() + github = parts[3].strip() + affiliation = parts[4].strip() + role = parts[5].strip() + + if name and name != 'Name' and (not '-----' in name): + committers.append(Committer(name, apache, github, affiliation, role)) + + return committers + + +def generate_table_row(committer: Committer) -> str: + """Generate a markdown table row for a committer.""" + github_link = f"[{committer.github}](https://github.com/{committer.github})" + return f"| {committer.name:<23} | {committer.apache:<39} |{committer.github:<39} | {committer.affiliation:<11} | {committer.role:<9} |" + + +def sort_committers(committers: List[Committer]) -> List[Committer]: + """Sort committers by role ('PMC Chair', PMC, Committer) then by apache id.""" + role_order = {'PMC Chair': 0, 'PMC': 1, 'Committer': 2} + + return sorted(committers, key=lambda c: (role_order.get(c.role, 3), c.apache.lower())) + + +def update_governance_file(file_path: str): + """Update the governance file with the latest committer information.""" + try: + with open(file_path, 'r') as f: + content = f.read() + except FileNotFoundError: + print(f"Error: 
File {file_path} not found") + return False + + # Parse existing committers + existing_committers = parse_existing_table(content) + print(f"Found {len(existing_committers)} existing committers") + + # Get ASF roster + asf_pmcs, asf_committers = get_asf_roster() + print(f"Found {len(asf_pmcs)} PMCs and {len(asf_committers)} committers in ASF roster") + + + # Create a map of existing committers by apache id + existing_by_apache = {c.apache: c for c in existing_committers} + + # Update the entries based on the ASF roster + updated_committers = [] + for apache_id, name in {**asf_pmcs, **asf_committers}.items(): + role = 'PMC' if apache_id in asf_pmcs else 'Committer' + if apache_id in existing_by_apache: + existing = existing_by_apache[apache_id] + # Preserve PMC Chair role if already set + if existing.role == 'PMC Chair': + role = 'PMC Chair' + updated_committers.append(Committer( + name=existing.name, + apache=apache_id, + github=existing.github, + affiliation=existing.affiliation, + role=role + )) + # add a new entry for new committers with placeholder values + else: + print(f"New entry found: {name} ({apache_id})") + # Placeholder github and affiliation + updated_committers.append(Committer( + name=name, + apache=apache_id, + github="", # user should update + affiliation="", # User should update + role=role + )) + + + # Sort the committers + sorted_committers = sort_committers(updated_committers) + + # Generate new table + table_lines = [ + "| Name | Apache ID | github | Affiliation | Role |", + "|-------------------------|-----------|----------------------------|-------------|-----------|" + ] + + for committer in sorted_committers: + table_lines.append(generate_table_row(committer)) + + new_table = '\n'.join(table_lines) + + # Replace the table in the content + start_marker = "" + end_marker = "" + + start_idx = content.find(start_marker) + end_idx = content.find(end_marker) + + if start_idx == -1 or end_idx == -1: + print("Error: Could not find table markers in file") + return False + + # Find the end of the start marker line + start_line_end = content.find('\n', start_idx) + 1 + + new_content = ( + content[:start_line_end] + + new_table + '\n' + + content[end_idx:] + ) + + # Write back to file + try: + with open(file_path, 'w') as f: + f.write(new_content) + print(f"Successfully updated {file_path}") + return True + except Exception as e: + print(f"Error writing file: {e}") + return False + + +def main(): + """Main function.""" + # Default path to governance file + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.dirname(script_dir) + governance_file = os.path.join(repo_root, "source", "contributor-guide", "governance.md") + + if len(sys.argv) > 1: + governance_file = sys.argv[1] + + if not os.path.exists(governance_file): + print(f"Error: Governance file not found at {governance_file}") + sys.exit(1) + + print(f"Updating committer list in {governance_file}") + + if update_governance_file(governance_file): + print("Committer list updated successfully") + else: + print("Failed to update committer list") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 3b1b86daac..0859beb788 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -84,3 +84,29 @@ Details: 8rem for search box etc*/ white-space: normal !important; } } + +/* Make wide tables scroll within the content area to avoid overlapping the + right sidebar. 
Prevents tables from bleeding underneath the sticky sidebar. */ +.bd-content table { + display: block; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + max-width: 100%; +} + +/* Restore proper table display to maintain column alignment */ +.bd-content table thead, +.bd-content table tbody { display: table-row-group; } + +.bd-content table tr { display: table-row; } + +.bd-content table th, +.bd-content table td { + display: table-cell; + white-space: normal; +} + +/* Maintain striped styling when table scrolls */ +.bd-content table tbody tr:nth-of-type(odd) { + background-color: rgba(0, 0, 0, 0.03); +} diff --git a/docs/source/contributor-guide/governance.md b/docs/source/contributor-guide/governance.md index 27ff90eb92..7cfc8c3b7c 100644 --- a/docs/source/contributor-guide/governance.md +++ b/docs/source/contributor-guide/governance.md @@ -19,10 +19,6 @@ # Governance -The current PMC and committers are listed in the [Apache Phonebook]. - -[apache phonebook]: https://projects.apache.org/committee.html?datafusion - ## Overview DataFusion is part of the [Apache Software Foundation] and is governed following @@ -38,6 +34,84 @@ As much as practicable, we strive to make decisions by consensus, and anyone in the community is encouraged to propose ideas, start discussions, and contribute to the project. +## People + +DataFusion is currently governed by the following individuals + + + + + +| Name | Apache ID | github | Affiliation | Role | +| ----------------------- | ---------------- | ------------------------------------------------------- | -------------- | --------- | +| Andrew Lamb | alamb | [alamb](https://github.com/alamb) | InfluxData | PMC Chair | +| Andrew Grove | agrove | [andygrove](https://github.com/andygrove) | Apple | PMC | +| Mustafa Akur | akurmustafa | [akurmustafa](https://github.com/akurmustafa) | OHSU | PMC | +| Berkay Şahin | berkay | [berkaysynnada](https://github.com/berkaysynnada) | Synnada | PMC | +| Oleksandr Voievodin | comphead | [comphead](https://github.com/comphead) | Apple | PMC | +| Daniël Heres | dheres | [Dandandan](https://github.com/Dandandan) | | PMC | +| QP Hou | houqp | [houqp](https://github.com/houqp) | | PMC | +| Jie Wen | jackwener | [jakevin](https://github.com/jackwener) | | PMC | +| Jay Zhan | jayzhan | [jayzhan211](https://github.com/jayzhan211) | | PMC | +| Jonah Gao | jonah | [jonahgao](https://github.com/jonahgao) | | PMC | +| Kun Liu | liukun | [liukun4515](https://github.com/liukun4515) | | PMC | +| Mehmet Ozan Kabak | ozankabak | [ozankabak](https://github.com/ozankabak) | Synnada, Inc | PMC | +| Tim Saucer | timsaucer | [timsaucer](https://github.com/timsaucer) | | PMC | +| L. C. 
Hsieh | viirya | [viirya](https://github.com/viirya) | Databricks | PMC | +| Ruihang Xia | wayne | [waynexia](https://github.com/waynexia) | Greptime | PMC | +| Wes McKinney | wesm | [wesm](https://github.com/wesm) | Posit | PMC | +| Will Jones | wjones127 | [wjones127](https://github.com/wjones127) | LanceDB | PMC | +| Xudong Wang | xudong963 | [xudong963](https://github.com/xudong963) | Polygon.io | PMC | +| Adrian Garcia Badaracco | adriangb | [adriangb](https://github.com/adriangb) | Pydantic | Committer | +| Brent Gardner | avantgardner | [avantgardnerio](https://github.com/avantgardnerio) | Coralogix | Committer | +| Dmitrii Blaginin | blaginin | [blaginin](https://github.com/blaginin) | SpiralDB | Committer | +| Piotr Findeisen | findepi | [findepi](https://github.com/findepi) | dbt Labs | Committer | +| Jax Liu | goldmedal | [goldmedal](https://github.com/goldmedal) | Canner | Committer | +| Huaxin Gao | huaxingao | [huaxingao](https://github.com/huaxingao) | | Committer | +| Ifeanyi Ubah | iffyio | [iffyio](https://github.com/iffyio) | Validio | Committer | +| Jeffrey Vo | jeffreyvo | [Jefffrey](https://github.com/Jefffrey) | | Committer | +| Liu Jiayu | jiayuliu | [jimexist](https://github.com/jimexist) | | Committer | +| Ruiqiu Cao | kamille | [Rachelint](https://github.com/Rachelint) | Tencent | Committer | +| Kazuyuki Tanimura | kazuyukitanimura | [kazuyukitanimura](https://github.com/kazuyukitanimura) | | Committer | +| Eduard Karacharov | korowa | [korowa](https://github.com/korowa) | | Committer | +| Siew Kam Onn | kosiew | [kosiew](https://github.com/kosiew) | | Committer | +| Lewis Zhang | linwei | [lewiszlw](https://github.com/lewiszlw) | diit.cn | Committer | +| Matt Butrovich | mbutrovich | [mbutrovich](https://github.com/mbutrovich) | Apple | Committer | +| Metehan Yildirim | mete | [metegenez](https://github.com/metegenez) | | Committer | +| Marko Milenković | milenkovicm | [milenkovicm](https://github.com/milenkovicm) | | Committer | +| Wang Mingming | mingmwang | [mingmwang](https://github.com/mingmwang) | | Committer | +| Michael Ward | mjward | [Michael-J-Ward ](https://github.com/Michael-J-Ward) | | Committer | +| Marco Neumann | mneumann | [crepererum](https://github.com/crepererum) | InfluxData | Committer | +| Zhong Yanghong | nju_yaho | [yahoNanJing](https://github.com/yahoNanJing) | | Committer | +| Paddy Horan | paddyhoran | [paddyhoran](https://github.com/paddyhoran) | Assured Allies | Committer | +| Parth Chandra | parthc | [parthchandra](https://github.com/parthchandra) | Apple | Committer | +| Rémi Dettai | rdettai | [rdettai](https://github.com/rdettai) | | Committer | +| Chao Sun | sunchao | [sunchao](https://github.com/sunchao) | OpenAI | Committer | +| Daniel Harris | thinkharderdev | [thinkharderdev](https://github.com/thinkharderdev) | Coralogix | Committer | +| Raphael Taylor-Davies | tustvold | [tustvold](https://github.com/tustvold) | | Committer | +| Weijun Huang | weijun | [Weijun-H](https://github.com/Weijun-H) | OrbDB | Committer | +| Yang Jiang | yangjiang | [Ted-jiang](https://github.com/Ted-jiang) | Ebay | Committer | +| Yijie Shen | yjshen | [yjshen](https://github.com/yjshen) | DataPelago | Committer | +| Yongting You | ytyou | [2010YOUY01](https://github.com/2010YOUY01) | Independent | Committer | +| Qi Zhu | zhuqi | [zhuqi-lucas](https://github.com/zhuqi-lucas) | Polygon.io | Committer | + + + +Note that the authoritative list of PMC and committers is the [Apache Phonebook] + +[apache phonebook]: 
https://projects.apache.org/committee.html?datafusion + ## Roles - **Contributors**: Anyone who contributes to the project, whether it be code, diff --git a/docs/source/contributor-guide/gsoc_application_guidelines.md b/docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md similarity index 99% rename from docs/source/contributor-guide/gsoc_application_guidelines.md rename to docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md index e8ca9703a5..c127b4231b 100644 --- a/docs/source/contributor-guide/gsoc_application_guidelines.md +++ b/docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md @@ -1,4 +1,4 @@ -# GSoC Application Guidelines +# GSoC Application Guidelines (2025) ## Introduction diff --git a/docs/source/contributor-guide/gsoc_project_ideas.md b/docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md similarity index 99% rename from docs/source/contributor-guide/gsoc_project_ideas.md rename to docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md index da6c24e292..d81d9eb9ad 100644 --- a/docs/source/contributor-guide/gsoc_project_ideas.md +++ b/docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md @@ -1,4 +1,4 @@ -# GSoC Project Ideas +# GSoC Project Ideas (2025) ## Introduction diff --git a/docs/source/contributor-guide/gsoc/index.rst b/docs/source/contributor-guide/gsoc/index.rst new file mode 100644 index 0000000000..10b0013e9b --- /dev/null +++ b/docs/source/contributor-guide/gsoc/index.rst @@ -0,0 +1,36 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Google Summer of Code (GSOC) +============================ + +DataFusion has participated in +`Google Summer of Code (GSOC) `_ +since 2025. GSOC is a global program that offers students stipends to +write code for open source projects. + +If you are a interested in contributing to DataFusion, we encourage you +to apply. You can find more information about the application process and +project ideas in the sections below. + + +.. toctree:: + :maxdepth: 1 + + gsoc_application_guidelines_2025 + gsoc_project_ideas_2025 + diff --git a/docs/source/index.rst b/docs/source/index.rst index 2fc7970f09..574c285b0e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,12 +51,14 @@ The following related subprojects target end users and have separate documentati queries. - `DataFusion Comet `_ is an accelerator for Apache Spark based on DataFusion. +- `DataFusion Ballista `_ is distributed processing extension for DataFusion. "Out of the box," DataFusion offers `SQL `_ and `Dataframe `_ APIs, excellent `performance `_, built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community. `Python Bindings `_ are also available. 
+`Ballista `_ is Apache DataFusion extension enabling the parallelized execution of workloads across multiple nodes in a distributed environment. DataFusion features a full query planner, a columnar, streaming, multi-threaded, vectorized execution engine, and partitioned data sources. You can @@ -155,8 +157,7 @@ To get started, see contributor-guide/governance contributor-guide/inviting contributor-guide/specification/index - contributor-guide/gsoc_application_guidelines - contributor-guide/gsoc_project_ideas + contributor-guide/gsoc/index .. _toc.subprojects: @@ -164,6 +165,6 @@ To get started, see :maxdepth: 1 :caption: DataFusion Subprojects - DataFusion Ballista + DataFusion Ballista DataFusion Comet DataFusion Python diff --git a/docs/source/library-user-guide/query-optimizer.md b/docs/source/library-user-guide/query-optimizer.md index 224510083f..877ff8c754 100644 --- a/docs/source/library-user-guide/query-optimizer.md +++ b/docs/source/library-user-guide/query-optimizer.md @@ -68,7 +68,7 @@ fn observer(plan: &LogicalPlan, rule: &dyn OptimizerRule) { ## Writing Optimization Rules Please refer to the -[optimizer_rule.rs](../../../datafusion-examples/examples/optimizer_rule.rs) +[optimizer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/optimizer_rule.rs) example to learn more about the general approach to writing optimizer rules and then move onto studying the existing rules. diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 88d93f5c0d..d70413467a 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -19,10 +19,55 @@ # Upgrade Guides -## DataFusion `50.0.0` +## DataFusion `51.0.0` + +**Note:** DataFusion `51.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. + +You can see the current [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558) + +### `MSRV` updated to 1.87.0 + +The Minimum Supported Rust Version (MSRV) has been updated to [`1.87.0`]. + +[`1.87.0`]: https://releases.rs/docs/1.87.0/ + +### `datafusion-proto` use `TaskContext` rather than `SessionContext` in physical plan serde methods + +There have been changes in the public API methods of `datafusion-proto` which handle physical plan serde. + +Methods like `physical_plan_from_bytes`, `parse_physical_expr` and similar, expect `TaskContext` instead of `SessionContext` + +```diff +- let plan2 = physical_plan_from_bytes(&bytes, &ctx)?; ++ let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?; +``` -**Note:** DataFusion `50.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. -You can see the current [status of the `50.0.0 `release here](https://github.com/apache/datafusion/issues/16799) +as `TaskContext` contains `RuntimeEnv` methods such as `try_into_physical_plan` will not have explicit `RuntimeEnv` parameter. + +```diff +let result_exec_plan: Arc = proto +- .try_into_physical_plan(&ctx, runtime.deref(), &composed_codec) ++. 
.try_into_physical_plan(&ctx.task_ctx(), &composed_codec) +``` + +`PhysicalExtensionCodec::try_decode()` expects `TaskContext` instead of `FunctionRegistry`: + +```diff +pub trait PhysicalExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], +- registry: &dyn FunctionRegistry, ++ ctx: &TaskContext, + ) -> Result>; +``` + +See [issue #17601] for more details. + +[issue #17601]: https://github.com/apache/datafusion/issues/17601 + +## DataFusion `50.0.0` ### ListingTable automatically detects Hive Partitioned tables diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index f00c4c2acc..4d0b897648 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -88,6 +88,7 @@ The following configuration settings are available: | datafusion.execution.parquet.binary_as_string | false | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead. | | datafusion.execution.parquet.coerce_int96 | NULL | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution. | | datafusion.execution.parquet.bloom_filter_on_read | true | (reading) Use any available bloom filters when reading parquet files | +| datafusion.execution.parquet.max_predicate_cache_size | NULL | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching. | | datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" | diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index abf0286fa8..56e4369a9b 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -288,6 +288,7 @@ select log(-1), log(0), sqrt(-1); | Syntax | Description | | ------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | avg(expr) | Сalculates the average value for `expr`. | +| avg_distinct(expr) | Creates an expression to represent the avg(distinct) aggregate function | | approx_distinct(expr) | Calculates an approximate count of the number of distinct values for `expr`. | | approx_median(expr) | Calculates an approximation of the median for `expr`. | | approx_percentile_cont(expr, percentile [, centroids]) | Calculates an approximation of the specified `percentile` for `expr`. Optional `centroids` parameter controls accuracy (default: 100). 
| @@ -298,7 +299,7 @@ select log(-1), log(0), sqrt(-1); | bool_and(expr) | Returns true if all non-null input values (`expr`) are true, otherwise false. | | bool_or(expr) | Returns true if any non-null input value (`expr`) is true, otherwise false. | | count(expr) | Returns the number of rows for `expr`. | -| count_distinct | Creates an expression to represent the count(distinct) aggregate function | +| count_distinct(expr) | Creates an expression to represent the count(distinct) aggregate function | | cube(exprs) | Creates a grouping set for all combination of `exprs` | | grouping_set(exprs) | Create a grouping set. | | max(expr) | Finds the maximum value of `expr`. | @@ -306,6 +307,7 @@ select log(-1), log(0), sqrt(-1); | min(expr) | Finds the minimum value of `expr`. | | rollup(exprs) | Creates a grouping set for rollup sets. | | sum(expr) | Сalculates the sum of `expr`. | +| sum_distinct(expr) | Creates an expression to represent the sum(distinct) aggregate function | ## Aggregate Function Builder diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index 9bb98a19ee..ef82de9a24 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -96,46 +96,50 @@ Here are some active projects using DataFusion: - [Arroyo](https://github.com/ArroyoSystems/arroyo) Distributed stream processing engine in Rust - [ArkFlow](https://github.com/arkflow-rs/arkflow) High-performance Rust stream processing engine -- [Auron](https://github.com/apache/auron) The Auron accelerator for big data engine (e.g., Spark, Flink) leverages native vectorized execution to accelerate query processing -- [Ballista](https://github.com/apache/datafusion-ballista) Distributed SQL Query Engine -- [CnosDB](https://github.com/cnosdb/cnosdb) Open Source Distributed Time Series Database +- [Auron] The Auron accelerator for big data engine (e.g., Spark, Flink) leverages native vectorized execution to accelerate query processing +- [Ballista] Distributed SQL Query Engine +- [CnosDB] Open Source Distributed Time Series Database - [Comet](https://github.com/apache/datafusion-comet) Apache Spark native query execution plugin -- [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) Cube’s universal semantic layer platform is the next evolution of OLAP technology for AI, BI, spreadsheets, and embedded analytics -- [Dask SQL](https://github.com/dask-contrib/dask-sql) Distributed SQL query engine in Python +- [Cube Store] Cube’s universal semantic layer platform is the next evolution of OLAP technology for AI, BI, spreadsheets, and embedded analytics +- [Dask SQL] Distributed SQL query engine in Python - [datafusion-dft](https://github.com/datafusion-contrib/datafusion-dft) Batteries included CLI, TUI, and server implementations for DataFusion. -- [delta-rs](https://github.com/delta-io/delta-rs) Native Rust implementation of Delta Lake +- [dbt Fusion engine](https://github.com/dbt-labs/dbt-fusion) The dbt Fusion engine, written in Rust, designed for speed and correctness with a native SQL understanding across DWH SQL dialects. +- [delta-rs] Native Rust implementation of Delta Lake - [Exon](https://github.com/wheretrue/exon) Analysis toolkit for life-science applications - [Feldera](https://github.com/feldera/feldera) Fast query engine for incremental computation - [Funnel](https://funnel.io/) Data Platform powering Marketing Intelligence applications. 
- [GlareDB](https://github.com/GlareDB/glaredb) Fast SQL database for querying and analyzing distributed data. -- [GreptimeDB](https://github.com/GreptimeTeam/greptimedb) Open Source & Cloud Native Distributed Time Series Database -- [HoraeDB](https://github.com/apache/incubator-horaedb) Distributed Time-Series Database +- [GreptimeDB] Open Source & Cloud Native Distributed Time Series Database +- [HoraeDB] Distributed Time-Series Database - [Iceberg-rust](https://github.com/apache/iceberg-rust) Rust implementation of Apache Iceberg -- [InfluxDB](https://github.com/influxdata/influxdb) Time Series Database -- [Kamu](https://github.com/kamu-data/kamu-cli/) Planet-scale streaming data pipeline +- [InfluxDB] Time Series Database +- [Kamu] Planet-scale streaming data pipeline - [LakeSoul](https://github.com/lakesoul-io/LakeSoul) Open source LakeHouse framework with native IO in Rust. - [Lance](https://github.com/lancedb/lance) Modern columnar data format for ML -- [OpenObserve](https://github.com/openobserve/openobserve) Distributed cloud native observability platform +- [OpenObserve] Distributed cloud native observability platform - [ParadeDB](https://github.com/paradedb/paradedb) PostgreSQL for Search & Analytics -- [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform +- [Parseable] Log storage and observability platform - [Polygon.io](https://polygon.io/) Stock Market API -- [qv](https://github.com/timvw/qv) Quickly view your data +- [qv] Quickly view your data +- [R2 Query Engine](https://blog.cloudflare.com/r2-sql-deep-dive/) Cloudflare's distributed engine for querying data in Iceberg Catalogs - [Restate](https://github.com/restatedev) Easily build resilient applications using distributed durable async/await -- [ROAPI](https://github.com/roapi/roapi) Create full-fledged APIs for slowly moving datasets without writing a single line of code +- [ROAPI] Create full-fledged APIs for slowly moving datasets without writing a single line of code - [Sail](https://github.com/lakehq/sail) Unifying stream, batch and AI workloads with Apache Spark compatibility -- [Seafowl](https://github.com/splitgraph/seafowl) CDN-friendly analytical database +- [Seafowl] CDN-friendly analytical database +- [SedonaDB](https://github.com/apache/sedona-db) A single-node analytical database engine with geospatial as a first-class citizen - [Sleeper](https://github.com/gchq/sleeper) Serverless, cloud-native, log-structured merge tree based, scalable key-value store -- [Spice.ai](https://github.com/spiceai/spiceai) Building blocks for data-driven AI applications -- [Synnada](https://synnada.ai/) Streaming-first framework for data products -- [VegaFusion](https://vegafusion.io/) Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar +- [Spice.ai] Building blocks for data-driven AI applications +- [Synnada] Streaming-first framework for data products +- [VegaFusion] Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar - [Telemetry](https://telemetry.sh/) Structured logging made easy +- [Xorq](https://github.com/xorq-labs/xorq/) Xorq is a multi-engine batch transformation framework built on Ibis, DataFusion and Arrow Here are some less active projects that used DataFusion: - [bdt](https://github.com/datafusion-contrib/bdt) Boring Data Tool -- [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -- [Flock](https://github.com/flock-lab/flock) -- [Tensorbase](https://github.com/tensorbase/tensorbase) +- [Cloudfuse 
Buzz] +- [Flock] +- [Tensorbase] [ballista]: https://github.com/apache/datafusion-ballista [auron]: https://github.com/apache/auron @@ -147,7 +151,7 @@ Here are some less active projects that used DataFusion: [delta-rs]: https://github.com/delta-io/delta-rs [flock]: https://github.com/flock-lab/flock [kamu]: https://github.com/kamu-data/kamu-cli -[greptime db]: https://github.com/GreptimeTeam/greptimedb +[greptimedb]: https://github.com/GreptimeTeam/greptimedb [horaedb]: https://github.com/apache/incubator-horaedb [influxdb]: https://github.com/influxdata/influxdb [openobserve]: https://github.com/openobserve/openobserve diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index b0811ab781..4a1069d4fd 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -81,6 +81,17 @@ abs(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT abs(-5); ++----------+ +| abs(-5) | ++----------+ +| 5 | ++----------+ +``` + ### `acos` Returns the arc cosine or inverse cosine of a number. @@ -93,6 +104,17 @@ acos(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT acos(1); ++----------+ +| acos(1) | ++----------+ +| 0.0 | ++----------+ +``` + ### `acosh` Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number. @@ -105,6 +127,17 @@ acosh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT acosh(2); ++------------+ +| acosh(2) | ++------------+ +| 1.31696 | ++------------+ +``` + ### `asin` Returns the arc sine or inverse sine of a number. @@ -117,6 +150,17 @@ asin(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT asin(0.5); ++------------+ +| asin(0.5) | ++------------+ +| 0.5235988 | ++------------+ +``` + ### `asinh` Returns the area hyperbolic sine or inverse hyperbolic sine of a number. @@ -129,6 +173,17 @@ asinh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT asinh(1); ++------------+ +| asinh(1) | ++------------+ +| 0.8813736 | ++------------+ +``` + ### `atan` Returns the arc tangent or inverse tangent of a number. @@ -141,6 +196,17 @@ atan(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql + > SELECT atan(1); ++-----------+ +| atan(1) | ++-----------+ +| 0.7853982 | ++-----------+ +``` + ### `atan2` Returns the arc tangent or inverse tangent of `expression_y / expression_x`. @@ -156,6 +222,17 @@ atan2(expression_y, expression_x) - **expression_x**: Second numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. 
+#### Example + +```sql +> SELECT atan2(1, 1); ++------------+ +| atan2(1,1) | ++------------+ +| 0.7853982 | ++------------+ +``` + ### `atanh` Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number. @@ -168,6 +245,17 @@ atanh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT atanh(0.5); ++-------------+ +| atanh(0.5) | ++-------------+ +| 0.5493061 | ++-------------+ +``` + ### `cbrt` Returns the cube root of a number. @@ -180,6 +268,17 @@ cbrt(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT cbrt(27); ++-----------+ +| cbrt(27) | ++-----------+ +| 3.0 | ++-----------+ +``` + ### `ceil` Returns the nearest integer greater than or equal to a number. @@ -192,6 +291,17 @@ ceil(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT ceil(3.14); ++------------+ +| ceil(3.14) | ++------------+ +| 4.0 | ++------------+ +``` + ### `cos` Returns the cosine of a number. @@ -204,6 +314,17 @@ cos(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT cos(0); ++--------+ +| cos(0) | ++--------+ +| 1.0 | ++--------+ +``` + ### `cosh` Returns the hyperbolic cosine of a number. @@ -216,6 +337,17 @@ cosh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT cosh(1); ++-----------+ +| cosh(1) | ++-----------+ +| 1.5430806 | ++-----------+ +``` + ### `cot` Returns the cotangent of a number. @@ -228,6 +360,17 @@ cot(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT cot(1); ++---------+ +| cot(1) | ++---------+ +| 0.64209 | ++---------+ +``` + ### `degrees` Converts radians to degrees. @@ -240,6 +383,17 @@ degrees(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT degrees(pi()); ++---------------+ +| degrees(pi()) | ++---------------+ +| 180.0 | ++---------------+ +``` + ### `exp` Returns the base-e exponential of a number. @@ -252,6 +406,17 @@ exp(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT exp(1); ++---------+ +| exp(1) | ++---------+ +| 2.71828 | ++---------+ +``` + ### `factorial` Factorial. Returns 1 if value is less than 2. @@ -264,6 +429,17 @@ factorial(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT factorial(5); ++---------------+ +| factorial(5) | ++---------------+ +| 120 | ++---------------+ +``` + ### `floor` Returns the nearest integer less than or equal to a number.
@@ -276,6 +452,17 @@ floor(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT floor(3.14); ++-------------+ +| floor(3.14) | ++-------------+ +| 3.0 | ++-------------+ +``` + ### `gcd` Returns the greatest common divisor of `expression_x` and `expression_y`. Returns 0 if both inputs are zero. @@ -289,6 +476,17 @@ gcd(expression_x, expression_y) - **expression_x**: First numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - **expression_y**: Second numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT gcd(48, 18); ++------------+ +| gcd(48,18) | ++------------+ +| 6 | ++------------+ +``` + ### `isnan` Returns true if a given number is +NaN or -NaN otherwise returns false. @@ -301,6 +499,17 @@ isnan(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT isnan(1); ++----------+ +| isnan(1) | ++----------+ +| false | ++----------+ +``` + ### `iszero` Returns true if a given number is +0.0 or -0.0 otherwise returns false. @@ -313,6 +522,17 @@ iszero(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT iszero(0); ++------------+ +| iszero(0) | ++------------+ +| true | ++------------+ +``` + ### `lcm` Returns the least common multiple of `expression_x` and `expression_y`. Returns 0 if either input is zero. @@ -326,6 +546,17 @@ lcm(expression_x, expression_y) - **expression_x**: First numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - **expression_y**: Second numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT lcm(4, 5); ++----------+ +| lcm(4,5) | ++----------+ +| 20 | ++----------+ +``` + ### `ln` Returns the natural logarithm of a number. @@ -338,6 +569,17 @@ ln(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT ln(2.71828); ++-------------+ +| ln(2.71828) | ++-------------+ +| 1.0 | ++-------------+ +``` + ### `log` Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number. @@ -352,6 +594,17 @@ log(numeric_expression) - **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT log(10); ++---------+ +| log(10) | ++---------+ +| 1.0 | ++---------+ +``` + ### `log10` Returns the base-10 logarithm of a number. @@ -364,6 +617,17 @@ log10(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT log10(100); ++-------------+ +| log10(100) | ++-------------+ +| 2.0 | ++-------------+ +``` + ### `log2` Returns the base-2 logarithm of a number. 
@@ -376,6 +640,17 @@ log2(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT log2(8); ++-----------+ +| log2(8) | ++-----------+ +| 3.0 | ++-----------+ +``` + ### `nanvl` Returns the first argument if it's not _NaN_. @@ -390,6 +665,17 @@ nanvl(expression_x, expression_y) - **expression_x**: Numeric expression to return if it's not _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators. - **expression_y**: Numeric expression to return if the first expression is _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators. +#### Example + +```sql +> SELECT nanvl(0, 5); ++------------+ +| nanvl(0,5) | ++------------+ +| 0 | ++------------+ +``` + ### `pi` Returns an approximate value of π. @@ -415,6 +701,17 @@ power(base, exponent) - **base**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - **exponent**: Exponent numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT power(2, 3); ++-------------+ +| power(2,3) | ++-------------+ +| 8 | ++-------------+ +``` + #### Aliases - pow @@ -431,6 +728,17 @@ radians(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT radians(180); ++----------------+ +| radians(180) | ++----------------+ +| 3.14159265359 | ++----------------+ +``` + ### `random` Returns a random float value in the range [0, 1). @@ -440,6 +748,17 @@ The random seed is unique to each row. random() ``` +#### Example + +```sql +> SELECT random(); ++------------------+ +| random() | ++------------------+ +| 0.7389238902938 | ++------------------+ +``` + ### `round` Rounds a number to the nearest integer. @@ -453,6 +772,17 @@ round(numeric_expression[, decimal_places]) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - **decimal_places**: Optional. The number of decimal places to round to. Defaults to 0. +#### Example + +```sql +> SELECT round(3.14159); ++--------------+ +| round(3.14159)| ++--------------+ +| 3.0 | ++--------------+ +``` + ### `signum` Returns the sign of a number. @@ -467,6 +797,17 @@ signum(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT signum(-42); ++-------------+ +| signum(-42) | ++-------------+ +| -1 | ++-------------+ +``` + ### `sin` Returns the sine of a number. @@ -479,6 +820,17 @@ sin(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT sin(0); ++----------+ +| sin(0) | ++----------+ +| 0.0 | ++----------+ +``` + ### `sinh` Returns the hyperbolic sine of a number. @@ -491,6 +843,17 @@ sinh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT sinh(1); ++-----------+ +| sinh(1) | ++-----------+ +| 1.1752012 | ++-----------+ +``` + ### `sqrt` Returns the square root of a number. 
@@ -515,6 +878,17 @@ tan(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT tan(pi()/4); ++--------------+ +| tan(PI()/4) | ++--------------+ +| 1.0 | ++--------------+ +``` + ### `tanh` Returns the hyperbolic tangent of a number. @@ -527,6 +901,17 @@ tanh(numeric_expression) - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +#### Example + +```sql +> SELECT tanh(20); ++----------+ +| tanh(20) | ++----------+ +| 1.0 | ++----------+ +``` + ### `trunc` Truncates a number to a whole number or truncated to the specified decimal places. @@ -544,6 +929,17 @@ trunc(numeric_expression[, decimal_places]) right of the decimal point. If `decimal_places` is a negative integer, replaces digits to the left of the decimal point with `0`. +#### Example + +```sql +> SELECT trunc(42.738); ++----------------+ +| trunc(42.738) | ++----------------+ +| 42 | ++----------------+ +``` + ## Conditional Functions - [coalesce](#coalesce) diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md index 39163cf492..87e940245b 100644 --- a/docs/source/user-guide/sql/select.md +++ b/docs/source/user-guide/sql/select.md @@ -40,6 +40,7 @@ DataFusion supports the following syntax for queries: [ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ]
[ [LIMIT](#limit-clause) count ]
[ [EXCLUDE | EXCEPT](#exclude-and-except-clause) ]
+[Pipe operators](#pipe-operators)
@@ -327,3 +328,98 @@ FROM table; SELECT * EXCLUDE(age, person) FROM table; ``` + +## Pipe operators + +Some SQL dialects (e.g. BigQuery) support the pipe operator `|>`. +The SQL dialect can be set like this: + +```sql +set datafusion.sql_parser.dialect = 'BigQuery'; +``` + +DataFusion currently supports the following pipe operators: + +- [WHERE](#pipe_where) +- [ORDER BY](#pipe_order_by) +- [LIMIT](#pipe_limit) +- [SELECT](#pipe_select) +- [EXTEND](#pipe_extend) + +(pipe_where)= + +### WHERE + +```sql +select * from range(0,10) +|> where value < 2; ++-------+ +| value | ++-------+ +| 0 | +| 1 | ++-------+ +``` + +(pipe_order_by)= + +### ORDER BY + +```sql +select * from range(0,3) +|> order by value desc; ++-------+ +| value | ++-------+ +| 2 | +| 1 | +| 0 | ++-------+ +``` + +(pipe_limit)= + +### LIMIT + +```sql +select * from range(0,3) +|> order by value desc +|> limit 1; ++-------+ +| value | ++-------+ +| 2 | ++-------+ +``` + +(pipe_select)= + +### SELECT + +```sql +select * from range(0,3) +|> select value + 10; ++---------------------------+ +| range().value + Int64(10) | ++---------------------------+ +| 10 | +| 11 | +| 12 | ++---------------------------+ +``` + +(pipe_extend)= + +### EXTEND + +```sql +select * from range(0,3) +|> extend -value AS minus_value; ++-------+-------------+ +| value | minus_value | ++-------+-------------+ +| 0 | 0 | +| 1 | -1 | +| 2 | -2 | ++-------+-------------+ +``` diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 55d572362d..7697bc1c1e 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -19,5 +19,5 @@ # to compile this workspace and run CI jobs. [toolchain] -channel = "1.89.0" +channel = "1.90.0" components = ["rustfmt", "clippy"] diff --git a/test-utils/src/array_gen/random_data.rs b/test-utils/src/array_gen/random_data.rs index 78518b7bf9..ea2b872f7d 100644 --- a/test-utils/src/array_gen/random_data.rs +++ b/test-utils/src/array_gen/random_data.rs @@ -17,12 +17,12 @@ use arrow::array::ArrowPrimitiveType; use arrow::datatypes::{ - i256, Date32Type, Date64Type, Decimal128Type, Decimal256Type, - DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, - DurationSecondType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, - Int8Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano, - IntervalMonthDayNanoType, IntervalYearMonthType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, + i256, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, + Decimal64Type, DurationMicrosecondType, DurationMillisecondType, + DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, IntervalDayTime, IntervalDayTimeType, + IntervalMonthDayNano, IntervalMonthDayNanoType, IntervalYearMonthType, + Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; @@ -67,6 +67,8 @@ basic_random_data!(Time32MillisecondType); basic_random_data!(Time64MicrosecondType); basic_random_data!(Time64NanosecondType); basic_random_data!(IntervalYearMonthType); +basic_random_data!(Decimal32Type); +basic_random_data!(Decimal64Type); basic_random_data!(Decimal128Type); basic_random_data!(TimestampSecondType); basic_random_data!(TimestampMillisecondType); diff --git a/testing b/testing index d2a1371230..0d60ccae40 160000 --- a/testing +++ b/testing @@ -1 
+1 @@ -Subproject commit d2a13712303498963395318a4eb42872e66aead7 +Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 diff --git a/typos.toml b/typos.toml index 46f21febcf..e4b57f5c14 100644 --- a/typos.toml +++ b/typos.toml @@ -42,5 +42,6 @@ extend-exclude = [ "*.sql", "dev/changelog/**", "benchmarks/**", - "*.csv" + "*.csv", + "docs/source/contributor-guide/governance.md" ]